This server does not have OFED installed, and it is in loopback mode.

 

I found another server, also in loopback mode, with a ConnectX-3 and OFED installed.

 

By the way, what SSD are you using? Maybe it is related to the SSD? I’ve just run with a 2048k block size for a short duration and see no issue. I will run it longer to see whether I can hit this error.
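
For reference, a 2048k variant of the fio command quoted later in this thread would look like this (only the block size changed; device path as in the later run):

fio --bs=2048k --numjobs=4 --iodepth=16 --loops=1 --ioengine=libaio --direct=1 --invalidate=1 --fsync_on_close=1 --randrepeat=1 --norandommap --time_based --runtime=60 --filename=/dev/nvme0n1 --name=read-phase --rw=randwrite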

 

[root@slave3 fio]# lspci | grep -i mell

08:00.0 Ethernet controller: Mellanox Technologies MT27520 Family [ConnectX-3 Pro]

 

[root@slave3 fio]# lsmod | grep -i mlx

mlx4_ib               159744  0

ib_core               208896  15 ib_iser,ib_cm,rdma_cm,ib_umad,ib_srp,nvme_rdma,ib_isert,ib_uverbs,rpcrdma,ib_ipoib,iw_cm,ib_srpt,ib_ucm,rdma_ucm,mlx4_ib

mlx4_en               114688  0

mlx4_core             307200  2 mlx4_en,mlx4_ib

ptp                    20480  3 ixgbe,igb,mlx4_en

 

[root@slave3 fio]# ofed_info

MLNX_OFED_LINUX-3.2-2.0.0.0 (OFED-3.2-2.0.0):

 

ar_mgr:

osm_plugins/ar_mgr/ar_mgr-1.0-0.30.ga1ea4b7.tar.gz

 

cc_mgr:

osm_plugins/cc_mgr/cc_mgr-1.0-0.29.ga1ea4b7.tar.gz

 

dapl:

dapl.git mlnx_ofed_3_1

commit c30fb6ce2cbc29d8ed4bde51437f7abb93378c78

dump_pr:

osm_plugins/dump_pr//dump_pr-1.0-0.25.ga1ea4b7.tar.gz

 

fabric-collector:

fabric_collector//fabric-collector-1.1.0.MLNX20140410.51b267e.tar.gz

 

fca:

mlnx_ofed_fca/fca-2.5.2431-1.src.rpm

 

hcoll:

mlnx_ofed_hcol/hcoll-3.4.807-1.src.rpm

 

ibacm:

mlnx_ofed/ibacm.git mlnx_ofed_3_2

commit 15ad8c13bdebbe62edea0b7df030710b65c14f7f

ibacm_ssa:

mlnx_ofed_ssa/acm/ibacm_ssa-0.0.9.3.MLNX20151203.50eb579.tar.gz

 

ibdump:

sniffer/sniffer-4.0.0-2/ibdump/linux/ibdump-4.0.0-2.tgz

 

ibsim:

mlnx_ofed_ibsim/ibsim-0.6-0.8.g9d76581.tar.gz

 

ibssa:

mlnx_ofed_ssa/distrib/ibssa-0.0.9.3.MLNX20151203.50eb579.tar.gz

 

ibutils:

ofed-1.5.3-rpms/ibutils/ibutils-1.5.7.1-0.12.gdcaeae2.tar.gz

 

ibutils2:

ibutils2/ibutils2-2.1.1-0.76.MLNX20160222.gd366c7b.tar.gz

 

infiniband-diags:

mlnx_ofed_infiniband_diags/infiniband-diags-1.6.6.MLNX20151130.7f0213e.tar.gz

 

infinipath-psm:

mlnx_ofed_infinipath-psm/infinipath-psm-3.3-2_g6f42cdb_open.tar.gz

 

iser:

mlnx_ofed/mlnx_rdma.git mlnx_ofed_3_2_2

commit 378ff029c77bac76cd02a8b89c6f3109bbb11c3d

 

isert:

mlnx_ofed/mlnx_rdma.git mlnx_ofed_3_2_2

commit 378ff029c77bac76cd02a8b89c6f3109bbb11c3d

 

kernel-mft:

mlnx_ofed_mft/kernel-mft-4.3.0-25.src.rpm

 

knem:

knem.git mellanox-master

commit f143ee19a575cd42a334422fa8bd329d671238db

libibcm:

mlnx_ofed/libibcm.git mlnx_ofed_3_0

commit d7d485df305e6536711485bd7e477668e77d8320

libibmad:

mlnx_ofed_libibmad/libibmad-1.3.12.MLNX20151122.d140cb1.tar.gz

 

libibprof:

mlnx_ofed_libibprof/libibprof-1.1.22-1.src.rpm

 

libibumad:

mlnx_ofed_libibumad/libibumad-1.3.10.2.MLNX20150406.966500d.tar.gz

 

libibverbs:

mlnx_ofed/libibverbs.git mlnx_ofed_3_2_2

commit 217a77686f4861229f0e4b94485a13f024634caf

libmlx4:

mlnx_ofed/libmlx4.git mlnx_ofed_3_2_2

commit dda6d7ae1e6e3779a485ebdd0a882f4bcbd027a6

libmlx5:

mlnx_ofed/libmlx5.git mlnx_ofed_3_2_2

commit d0c8645359e0f0aba0408b2d344f3b418d27019b

libopensmssa:

mlnx_ofed_ssa/plugin/libopensmssa-0.0.9.3.MLNX20151203.50eb579.tar.gz

 

librdmacm:

mlnx_ofed/librdmacm.git mlnx_ofed_3_2_2

commit 6bd430fed9e7b3d57a1876c040431ce7295c7703

libsdp:

libsdp.git mlnx_ofed_3_0

commit fbd01dfff05f42d6b82506e7dbf4bc6b7e6a59a4

libvma:

vma/source_rpms//libvma-7.0.14-0.src.rpm

 

mlnx-ethtool:

upstream/ethtool.git for-upstream

commit ac0cf295abe0c0832f0711fed66ab9601c8b2513

mlnx-ofa_kernel:

mlnx_ofed/mlnx_rdma.git mlnx_ofed_3_2_2

commit 378ff029c77bac76cd02a8b89c6f3109bbb11c3d

 

mpi-selector:

ofed-1.5.3-rpms/mpi-selector/mpi-selector-1.0.3-1.src.rpm

 

mpitests:

mlnx_ofed_mpitest/mpitests-3.2.17-e1c7f2f.src.rpm

 

mstflint:

mlnx_ofed_mstflint/mstflint-4.3.0-1.49.g9b9af70.tar.gz

 

multiperf:

mlnx_ofed_multiperf/multiperf-3.0-0.10.gda89e8c.tar.gz

 

mvapich2:

mlnx_ofed_mvapich2/mvapich2-2.2a-1.src.rpm

 

mxm:

mlnx_ofed_mxm/mxm-3.4.3079-1.src.rpm

 

ofed-docs:

docs.git mlnx_ofed-3.2

commit ea3386416f9f7130edd2c70fc3424cb2cda50f7d

 

openmpi:

mlnx_ofed_ompi_1.8/openmpi-1.10.3a1-1.src.rpm

 

opensm:

mlnx_ofed_opensm/opensm-4.6.1.MLNX20160112.774e977.tar.gz

 

perftest:

mlnx_ofed_perftest/perftest-3.0-0.18.gb464d59.tar.gz

 

qperf:

mlnx_ofed_qperf/qperf-0.4.9.tar.gz

 

rds-tools:

rds-tools.git mlnx_ofed_2_4

commit 299420ca25cf9996bc0748e3bc4b08748996ba49

sdpnetstat:

sdpnetstat.git mlnx_ofed_3_0

commit 3cf409a7cc07e5c71f9640eddbb801ece21b4169

sockperf:

sockperf/sockperf-2.7-43.git3ee62bd8107a.src.rpm

 

srp:

mlnx_ofed/mlnx_rdma.git mlnx_ofed_3_2_2

commit 378ff029c77bac76cd02a8b89c6f3109bbb11c3d

 

srptools:

srptools/srptools-1.0.2-12.src.rpm

 

 

Installed Packages:

-------------------

infiniband-diags

librdmacm

libmlx4

libibverbs-utils

mpi-selector

libibmad-devel

sdpnetstat

knem

libibumad-devel

libsdp

mlnx-ethtool

libibverbs-debuginfo

mlnx-ofa_kernel-modules

srp

opensm

mstflint

cc_mgr

libibmad

libibverbs

kernel-mft

libibverbs-devel-static

libibumad

librdmacm-devel

mlnx-ofa_kernel

libsdp-devel

ibutils2

mlnxofed-docs

libibmad-static

iser

opensm-devel

dump_pr

libibumad-static

rds-tools

libmlx4-debuginfo

mlnx-ofa_kernel-devel

opensm-libs

opensm-static

ar_mgr

dapl-devel-static

infiniband-diags-compat

libibverbs-devel

ibsim

 

[root@slave3 fio]# ibstat

CA 'mlx4_0'

        CA type: MT4103

        Number of ports: 2

        Firmware version: 2.35.5100

        Hardware version: 0

        Node GUID: 0x248a0703006090e0

        System image GUID: 0x248a0703006090e0

        Port 1:

                State: Active

                Physical state: LinkUp

                Rate: 40

                Base lid: 0

                LMC: 0

                SM lid: 0

                Capability mask: 0x04010000

                Port GUID: 0x268a07fffe6090e0

                Link layer: Ethernet

        Port 2:

                State: Active

                Physical state: LinkUp

                Rate: 40

                Base lid: 0

                LMC: 0

                SM lid: 0

                Capability mask: 0x04010000

                Port GUID: 0x268a07fffe6090e1

                Link layer: Ethernet

 

[root@slave3 fio]# ibv_devinfo

hca_id: mlx4_0

        transport:                      InfiniBand (0)

        fw_ver:                         2.35.5100

        node_guid:                      248a:0703:0060:90e0

        sys_image_guid:                 248a:0703:0060:90e0

        vendor_id:                      0x02c9

        vendor_part_id:                 4103

        hw_ver:                         0x0

        board_id:                       MT_1090111023

        phys_port_cnt:                  2

                port:   1

                        state:                  PORT_ACTIVE (4)

                        max_mtu:                4096 (5)

                        active_mtu:             1024 (3)

                        sm_lid:                 0

                        port_lid:               0

                        port_lmc:               0x00

                        link_layer:             Ethernet

 

                port:   2

                        state:                  PORT_ACTIVE (4)

                        max_mtu:                4096 (5)

                        active_mtu:             1024 (3)

                        sm_lid:                 0

                        port_lid:               0

                        port_lmc:               0x00

                        link_layer:             Ethernet

 

From: Victor Banh [mailto:victorb@mellanox.com]
Sent: Thursday, October 19, 2017 12:34 PM
To: Storage Performance Development Kit <spdk@lists.01.org>; Harris, James R <james.r.harris@intel.com>; Cao, Gang <gang.cao@intel.com>
Subject: RE: [SPDK] Buffer I/O error on bigger block size running fio

 

Did you install Mellanox OFED on both the target and client servers?

Can you run ibstat on both servers?
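
For example, on each server (the same commands used elsewhere in this thread):

ofed_info     # prints the installed MLNX_OFED version and package list, if present
ibstat        # shows the CA type, firmware version and port state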

Thanks

Victor


From: Cao, Gang <gang.cao@intel.com>
Sent: Wednesday, October 18, 2017 8:59:28 PM
To: Victor Banh; Storage Performance Development Kit; Harris, James R
Subject: RE: [SPDK] Buffer I/O error on bigger block size running fio

 

Hi Victor,

 

I’ve just tried the SPDK v17.07.1 and DPDK v17.08.

 

nvme version: 1.1.38.gfaab

fio version: 3.1

 

I tried the 512k and 1024k I/O sizes and there is no error. The dmesg information is as follows.

 

So there may be some other difference here. It looks like you are using a ConnectX-5 while I am using a ConnectX-4?

 

Other related information:

 

[root@node4 fio]# lspci | grep -i mell

81:00.0 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]

81:00.1 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]

 

[root@node4 fio]# lsmod | grep -i mlx

mlx5_ib               172032  0

ib_core               200704  15 ib_iser,ib_cm,rdma_cm,ib_umad,ib_srp,nvme_rdma,ib_isert,ib_uverbs,rpcrdma,ib_ipoib,iw_cm,mlx5_ib,ib_srpt,ib_ucm,rdma_ucm

mlx5_core             380928  1 mlx5_ib

ptp                    20480  3 ixgbe,igb,mlx5_core

 

[root@node4 fio]# uname -a

Linux node4 4.10.1 #1 SMP Fri Mar 10 15:59:57 CST 2017 x86_64 x86_64 x86_64 GNU/Linux

 

[577707.543326] nvme nvme0: new ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery", addr 192.168.100.8:4420

[577730.854540] detected loopback device

[577730.893761] nvme nvme0: creating 7 I/O queues.

[577730.893797] detected loopback device

[577730.898611] detected loopback device

[577730.908917] detected loopback device

[577730.919073] detected loopback device

[577730.928922] detected loopback device

[577730.938679] detected loopback device

[577730.948365] detected loopback device

[577731.146290] nvme nvme0: new ctrl: NQN "nqn.2016-06.io.spdk:cnode2", addr 192.168.100.8:4420

 

Thanks,

Gang

 

From: Victor Banh [mailto:victorb@mellanox.com]
Sent: Thursday, October 19, 2017 9:43 AM
To: Cao, Gang <gang.cao@intel.com>; Storage Performance Development Kit <spdk@lists.01.org>; Harris, James R <james.r.harris@intel.com>
Subject: RE: [SPDK] Buffer I/O error on bigger block size running fio

 

Hi Gang

Any update?

Do you see any error messages from dmesg when running fio with a 512k block size?

Thanks

Victor

 

From: Victor Banh
Sent: Tuesday, October 17, 2017 7:37 PM
To: 'Cao, Gang' <gang.cao@intel.com>; Storage Performance Development Kit <spdk@lists.01.org>; Harris, James R <james.r.harris@intel.com>
Subject: RE: [SPDK] Buffer I/O error on bigger block size running fio

 

Hi Gang

spdk-17.07.1 and dpdk-17.08

Thanks

Victor

 

From: Cao, Gang [mailto:gang.cao@intel.com]
Sent: Monday, October 16, 2017 8:51 PM
To: Victor Banh <victorb@mellanox.com>; Storage Performance Development Kit <spdk@lists.01.org>; Harris, James R <james.r.harris@intel.com>
Subject: RE: [SPDK] Buffer I/O error on bigger block size running fio

 

Hi Victor,

 

Could you share which version of SPDK you are using when you see this error? Or maybe you could try with the latest SPDK code?

 

Thanks,

Gang

 

From: Victor Banh [mailto:victorb@mellanox.com]
Sent: Tuesday, October 17, 2017 5:30 AM
To: Cao, Gang <gang.cao@intel.com>; Storage Performance Development Kit <spdk@lists.01.org>; Harris, James R <james.r.harris@intel.com>
Subject: RE: [SPDK] Buffer I/O error on bigger block size running fio

 

Hi Cao

Do you see any messages from dmesg?

 

I tried this fio version and still saw these error messages from dmesg.

 

fio-3.1

 

[869053.218235] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218250] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218259] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218263] ldm_validate_partition_table(): Disk read failed.

[869053.218269] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218277] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218285] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218292] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218296] Dev nvme2n1: unable to read RDB block 0

[869053.218303] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218311] Buffer I/O error on dev nvme2n1, logical block 0, async page read

[869053.218323] Buffer I/O error on dev nvme2n1, logical block 3, async page read

[869053.218338]  nvme2n1: unable to read partition table

[869053.246126] nvme2n1: detected capacity change from -62111005559226368 to -62042256479092736

[869053.246195] ldm_validate_partition_table(): Disk read failed.

[869053.246217] Dev nvme2n1: unable to read RDB block 0

 

From: Cao, Gang [mailto:gang.cao@intel.com]
Sent: Monday, October 09, 2017 10:59 AM
To: Storage Performance Development Kit <spdk@lists.01.org>; Harris, James R <james.r.harris@intel.com>
Cc: Victor Banh <victorb@mellanox.com>
Subject: RE: [SPDK] Buffer I/O error on bigger block size running fio

 

Hi Victor,

 

Thanks for your detailed information on the testing.

 

I’ve tried the latest SPDK code with the latest fio-3.1-20-g132b and with fio-2.19. There seems to be no error of this kind.

 

Could you share which version of SPDK you are using when you see this error? Or maybe you could try with the latest SPDK code?

 

fio --bs=512k --numjobs=4 --iodepth=16 --loops=1 --ioengine=libaio --direct=1 --invalidate=1 --fsync_on_close=1 --randrepeat=1 --norandommap --time_based --runtime=60 --filename=/dev/nvme0n1  --name=read-phase --rw=randwrite

read-phase: (g=0): rw=randwrite, bs=(R) 512KiB-512KiB, (W) 512KiB-512KiB, (T) 512KiB-512KiB, ioengine=libaio, iodepth=16

...

fio-3.1-20-g132b

Starting 4 processes

Jobs: 4 (f=4): [w(4)][100.0%][r=0KiB/s,w=1592MiB/s][r=0,w=3183 IOPS][eta 00m:00s]

read-phase: (groupid=0, jobs=1): err= 0: pid=46378: Tue Oct 10 01:23:39 2017

 

My NIC information:

[root@node4 nvme-cli-gerrit]# lsmod | grep -i mlx

mlx5_ib               172032  0

ib_core               200704  15 ib_iser,ib_cm,rdma_cm,ib_umad,ib_srp,nvme_rdma,ib_isert,ib_uverbs,rpcrdma,ib_ipoib,iw_cm,mlx5_ib,ib_srpt,ib_ucm,rdma_ucm

mlx5_core             380928  1 mlx5_ib

ptp                    20480  3 ixgbe,igb,mlx5_core

[root@node4 nvme-cli-gerrit]# lspci | grep -i mell

81:00.0 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]

81:00.1 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]

 

From: SPDK [mailto:spdk-bounces@lists.01.org] On Behalf Of Victor Banh
Sent: Friday, October 6, 2017 2:41 PM
To: Harris, James R <james.r.harris@intel.com>; Storage Performance Development Kit <spdk@lists.01.org>
Subject: Re: [SPDK] Buffer I/O error on bigger block size running fio

 

 

 

From: Harris, James R [mailto:james.r.harris@intel.com]
Sent: Friday, October 06, 2017 2:32 PM
To: Storage Performance Development Kit <spdk@lists.01.org>
Cc: Victor Banh <victorb@mellanox.com>
Subject: Re: [SPDK] Buffer I/O error on bigger block size running fio

 

(cc Victor)

 

From: James Harris <james.r.harris@intel.com>
Date: Thursday, October 5, 2017 at 1:59 PM
To: Storage Performance Development Kit <spdk@lists.01.org>
Subject: Re: [SPDK] Buffer I/O error on bigger block size running fio

 

Hi Victor,

 

Could you provide a few more details?  This will help the list to provide some ideas.

 

1)      On the client, are you using the SPDK NVMe-oF initiator or the kernel initiator?

 

Kernel initiator; run these commands on the client server.

 

modprobe mlx5_ib

modprobe nvme-rdma

nvme discover -t rdma -a 192.168.10.11 -s 4420

nvme connect -t rdma -n nqn.2016-06.io.spdk:nvme-subsystem-1  -a 192.168.10.11 -s 4420
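
A couple of optional checks right after the connect (not part of the original steps, just standard nvme-cli/kernel checks):

nvme list            # confirm the new namespace (e.g. /dev/nvme1n1) shows up
dmesg | tail -n 20   # look for reconnects or "Buffer I/O error" messages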

 

 

2)      Can you provide the fio configuration file or command line?  Just so we can have more specifics on “bigger block size”.

 

fio --bs=512k --numjobs=4 --iodepth=16 --loops=1 --ioengine=libaio --direct=1 --invalidate=1 --fsync_on_close=1 --randrepeat=1 --norandommap --time_based --runtime=60 --filename=/dev/nvme1n1  --name=read-phase --rw=randwrite
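
To catch the errors as they happen, dmesg can be followed in another terminal while fio runs (an assumed helper, not from the original report):

dmesg -w | grep -i "buffer i/o error"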

 

3)      Any details on the HW setup – specifically details on the RDMA NIC (or if you’re using SW RoCE).

 

nvmf.conf on the target server:

 

[Global]

  Comment "Global section"

    ReactorMask 0xff00

 

[Rpc]

  Enable No

  Listen 127.0.0.1

 

[Nvmf]

  MaxQueuesPerSession 8

  MaxQueueDepth 128

 

[Subsystem1]

  NQN nqn.2016-06.io.spdk:nvme-subsystem-1

  Core 9

  Mode Direct

  Listen RDMA 192.168.10.11:4420

  NVMe 0000:82:00.0

  SN S2PMNAAH400039
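
For reference, with a config file like this the target would typically be launched along these lines (a sketch based on the SPDK 17.07 tree; the binary path and config filename are assumptions):

./scripts/setup.sh                        # allocate hugepages and bind the NVMe device to a userspace driver
./app/nvmf_tgt/nvmf_tgt -c nvmf.conf      # start the NVMe-oF target with the config above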

 

 

It is an RDMA NIC, a ConnectX-5. The CPU is an Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz.

NUMA node0 CPU(s):     0-7

NUMA node1 CPU(s):     8-15

 

 

 

 

Thanks,

 

-Jim

 

 

From: SPDK <spdk-bounces@lists.01.org> on behalf of Victor Banh <victorb@mellanox.com>
Reply-To: Storage Performance Development Kit <spdk@lists.01.org>
Date: Thursday, October 5, 2017 at 11:26 AM
To: "spdk@lists.01.org" <spdk@lists.01.org>
Subject: [SPDK] Buffer I/O error on bigger block size running fio

 

Hi

I have SPDK NVMe-oF set up and keep getting errors with bigger block sizes when running fio randwrite tests.

I am using Ubuntu 16.04 with kernel version 4.12.0-041200-generic on target and client.

The DPDK is 17.08 and SPDK is 17.07.1.

Thanks

Victor

 

 

[46905.233553] perf: interrupt took too long (2503 > 2500), lowering kernel.perf_event_max_sample_rate to 79750

[48285.159186] blk_update_request: I/O error, dev nvme1n1, sector 2507351968

[48285.159207] blk_update_request: I/O error, dev nvme1n1, sector 1301294496

[48285.159226] blk_update_request: I/O error, dev nvme1n1, sector 1947371168

[48285.159239] blk_update_request: I/O error, dev nvme1n1, sector 1891797568

[48285.159252] blk_update_request: I/O error, dev nvme1n1, sector 10833824

[48285.159265] blk_update_request: I/O error, dev nvme1n1, sector 614937152

[48285.159277] blk_update_request: I/O error, dev nvme1n1, sector 1872305088

[48285.159290] blk_update_request: I/O error, dev nvme1n1, sector 1504491040

[48285.159299] blk_update_request: I/O error, dev nvme1n1, sector 1182136128

[48285.159308] blk_update_request: I/O error, dev nvme1n1, sector 1662985792

[48285.191185] nvme nvme1: Reconnecting in 10 seconds...

[48285.191254] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191291] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191305] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191314] ldm_validate_partition_table(): Disk read failed.

[48285.191320] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191327] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191335] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191342] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191347] Dev nvme1n1: unable to read RDB block 0

[48285.191353] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191360] Buffer I/O error on dev nvme1n1, logical block 0, async page read

[48285.191375] Buffer I/O error on dev nvme1n1, logical block 3, async page read

[48285.191389]  nvme1n1: unable to read partition table

[48285.223197] nvme1n1: detected capacity change from 1600321314816 to 0

[48289.623192] nvme1n1: detected capacity change from 0 to -65647705833078784

[48289.623411] ldm_validate_partition_table(): Disk read failed.

[48289.623447] Dev nvme1n1: unable to read RDB block 0

[48289.623486]  nvme1n1: unable to read partition table

[48289.643305] ldm_validate_partition_table(): Disk read failed.

[48289.643328] Dev nvme1n1: unable to read RDB block 0

[48289.643373]  nvme1n1: unable to read partition table