【标题描述】能够简要描述问题:说明什么场景下,做了什么操作,出现什么问题(尽量使用正向表达方式)
并发执行多个perftest进程期间触发global复位,复位后系统挂死
一、缺陷信息
内核信息:
kernel 5.10
缺陷归属组件:
NA
缺陷归属的版本:
openeuler-lts-sp3-rc2
缺陷简述:
并发运行多个ib_write_bw(perftest)进程期间,执行ethtool --reset <网口> all触发global复位,复位后系统报call trace挂死
【环境信息】
硬件信息
裸机场景请提供问题的硬件信息
鲲鹏920
虚拟机场景请提供虚拟机的XML文件或配置信息
NA
软件信息
OS版本及分支信息
openeuler sp3 rc2
内核信息
openEuler (5.10.0-171.0.0.84.oe2203sp3.aarch64) 22.03 (LTS-SP3)
发现问题的组件版本信息
网络信息
如果有特殊组网,请提供网络拓扑信息
NA
【问题复现步骤】,请描述具体的操作步骤
ib_write_bw -d hns_3 --run_infinitely -p 18889 &
ib_write_bw -d hns_3 --run_infinitely -p 18890 &
ib_write_bw -d hns_3 --run_infinitely -p 18891 &
ib_write_bw -d hns_3 --run_infinitely -p 18892 &
ib_write_bw -d hns_3 --run_infinitely -p 18889 127.0.0.1 &
ib_write_bw -d hns_3 --run_infinitely -p 18890 127.0.0.1 &
ib_write_bw -d hns_3 --run_infinitely -p 18891 127.0.0.1 &
ib_write_bw -d hns_3 --run_infinitely -p 18892 127.0.0.1 &
ethtool --reset eth8 all
预期:
复位完成后,系统无异常
【实际结果】,请描述出问题的结果和影响
系统报call trace挂死
【其他相关附件信息】
比如系统message日志/组件日志、dump信息、图片等
NA
缺陷详情参考链接:
NA
缺陷分析指导链接:
NA
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。
[root@localhost ~]#
[root@localhost ~]# ib_write_bw -d roceo1 --run_infinitely -p 18889 &
[1] 56709
[root@localhost ~]#
************************************
* Waiting for client to connect... *
************************************
ib_write_bw -d roceo1 --run_infinitely -p 18890 &
[2] 56710
[root@localhost ~]#
************************************
* Waiting for client to connect... *
************************************
ib_write_bw -d roceo1 --run_infinitely -p 18891 &
[3] 56711
[root@localhost ~]#
************************************
* Waiting for client to connect... *
************************************
ib_write_bw -d roceo1 --run_infinitely -p 18892 &
[4] 56712
[root@localhost ~]#
************************************
* Waiting for client to connect... *
************************************
ib_write_bw -d roceo1 --run_infinitely -p 18889 192.168.1.20&
[5] 56713
[root@localhost ~]# ---------------------------------------------------------------------------------------
RDMA_Write BW Test
---------------------------------------------------------------------------------------
Dual-port : OFF Device : roceo1
RDMA_Write BW Test
Number of qps : 1 Transport type : IB
Dual-port : OFF Device : roceo1
Connection type : RC Using SRQ : OFF
Number of qps : 1 Transport type : IB
PCIe relax order: ON
Connection type : RC Using SRQ : OFF
PCIe relax order: ON
ibv_wr* API : ON
ibv_wr* API : ON
CQ Moderation : 1
TX depth : 128
Mtu : 1024[B]
CQ Moderation : 1
Link type : Ethernet
Mtu : 1024[B]
GID index : 3
Link type : Ethernet
Max inline data : 0[B]
GID index : 3
rdma_cm QPs : OFF
Max inline data : 0[B]
Data ex. method : Ethernet
rdma_cm QPs : OFF
---------------------------------------------------------------------------------------
Data ex. method : Ethernet
---------------------------------------------------------------------------------------
local address: LID 0000 QPN 0x000c PSN 0xd0beec RKey 0x000400 VAddr 0x00ffffa3885000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
local address: LID 0000 QPN 0x000e PSN 0x2a6284 RKey 0x000300 VAddr 0x00ffff8cdbc000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
remote address: LID 0000 QPN 0x000e PSN 0x2a6284 RKey 0x000300 VAddr 0x00ffff8cdbc000
remote address: LID 0000 QPN 0x000c PSN 0xd0beec RKey 0x000400 VAddr 0x00ffffa3885000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
---------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
ib_write_bw -d roceo1 --run_infinitely -p 18890 192.168.1.20&
[6] 56717
[root@localhost ~]# ib_write_bw -d---------------------------------------------------------------------------------------
RDMA_Write BW Test
Dual-port : OFF Device : roceo1
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
PCIe relax order: ON
---------------------------------------------------------------------------------------
RDMA_Write BW Test
Dual-port : OFF Device : roceo1
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
PCIe relax order: ON
ibv_wr* API : ON
ibv_wr* API : ON
TX depth : 128
CQ Moderation : 1
CQ Moderation : 1
Mtu : 1024[B]
Mtu : 1024[B]
Link type : Ethernet
Link type : Ethernet
GID index : 3
GID index : 3
Max inline data : 0[B]
Max inline data : 0[B]
rdma_cm QPs : OFF
rdma_cm QPs : OFF
Data ex. method : Ethernet
Data ex. method : Ethernet
---------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------
local address: LID 0000 QPN 0x002a PSN 0x93dde0 RKey 0x000600 VAddr 0x00ffff9f7e9000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
local address: LID 0000 QPN 0x0008 PSN 0xeb0925 RKey 0x000500 VAddr 0x00ffffadfff000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
remote address: LID 0000 QPN 0x0008 PSN 0xeb0925 RKey 0x000500 VAddr 0x00ffffadfff000
remote address: LID 0000 QPN 0x002a PSN 0x93dde0 RKey 0x000600 VAddr 0x00ffff9f7e9000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
---------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
roceo1 --run_infinitely -p 18891 192.168.1.20&
[7] 56719
[root@localhost ~]# ib_write_bw -d ---------------------------------------------------------------------------------------
RDMA_Write BW Test
Dual-port : OFF Device : roceo1
---------------------------------------------------------------------------------------
Number of qps : 1 Transport type : IB
RDMA_Write BW Test
Connection type : RC Using SRQ : OFF
Dual-port : OFF Device : roceo1
PCIe relax order: ON
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
PCIe relax order: ON
ibv_wr* API : ON
ibv_wr* API : ON
CQ Moderation : 1
TX depth : 128
Mtu : 1024[B]
CQ Moderation : 1
Link type : Ethernet
Mtu : 1024[B]
GID index : 3
Link type : Ethernet
Max inline data : 0[B]
GID index : 3
rdma_cm QPs : OFF
Max inline data : 0[B]
Data ex. method : Ethernet
rdma_cm QPs : OFF
---------------------------------------------------------------------------------------
Data ex. method : Ethernet
---------------------------------------------------------------------------------------
local address: LID 0000 QPN 0x000f PSN 0x30c9ae RKey 0x000800 VAddr 0x00ffff9e2a3000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
local address: LID 0000 QPN 0x000d PSN 0xb277ae RKey 0x000700 VAddr 0x00ffff9f6b6000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
remote address: LID 0000 QPN 0x000d PSN 0xb277ae RKey 0x000700 VAddr 0x00ffff9f6b6000
remote address: LID 0000 QPN 0x000f PSN 0x30c9ae RKey 0x000800 VAddr 0x00ffff9e2a3000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
roceo1 --run_infinitely -p 18892 192.168.1.20&
[8] 56721
[root@localhost ~]# 65536 61022 0.00 762.77 0.012204
---------------------------------------------------------------------------------------
RDMA_Write BW Test
Dual-port : OFF Device : roceo1
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
---------------------------------------------------------------------------------------
PCIe relax order: ON
RDMA_Write BW Test
Dual-port : OFF Device : roceo1
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
PCIe relax order: ON
ibv_wr* API : ON
ibv_wr* API : ON
CQ Moderation : 1
TX depth : 128
Mtu : 1024[B]
CQ Moderation : 1
Link type : Ethernet
Mtu : 1024[B]
GID index : 3
Link type : Ethernet
Max inline data : 0[B]
GID index : 3
rdma_cm QPs : OFF
Max inline data : 0[B]
Data ex. method : Ethernet
rdma_cm QPs : OFF
---------------------------------------------------------------------------------------
Data ex. method : Ethernet
---------------------------------------------------------------------------------------
local address: LID 0000 QPN 0x002b PSN 0xa78256 RKey 0x000a00 VAddr 0x00ffffa9818000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
local address: LID 0000 QPN 0x0009 PSN 0x2d7feb RKey 0x000900 VAddr 0x00ffff83e71000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
remote address: LID 0000 QPN 0x0009 PSN 0x2d7feb RKey 0x000900 VAddr 0x00ffff83e71000
remote address: LID 0000 QPN 0x002b PSN 0xa78256 RKey 0x000a00 VAddr 0x00ffffa9818000
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:20
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
65536 31605 0.00 395.06 0.006321
65536 24109 0.00 301.35 0.004822
ethtool --reset eno1 all
ETHTOOL_RESET 0xffffffff
Components reset: 0xffffffff
[root@localhost ~]# 65536 22470 0.00 280.87 0.004494
Completion with error at client
Completion with error at client
Failed status 5: wr_id 0 syndrom 0x0
Failed status 5: wr_id 0 syndrom 0x0
scnt=30200, ccnt=30200
scnt=85925, ccnt=85925
Error occurred while running infinitely! aborting ...
Error occurred while running infinitely! aborting ...
Completion with error at client
Completion with error at client
Failed status 5: wr_id 0 syndrom 0x0
Failed status 5: wr_id 0 syndrom 0x0
scnt=20284, ccnt=20284
scnt=45186, ccnt=45186
Error occurred while running infinitely! aborting ...
Error occurred while running infinitely! aborting ...
ethernet_read_keys: Couldn't read remote address
ethernet_read_keys: Couldn't read remote address
Unable to read to socket/rdma_cm
Unable to read to socket/rdma_cm
Failed to exchange data between server and clients
Failed to exchange data between server and clients
ethernet_read_keys: Couldn't read remote address
Unable to read to socket/rdma_cm
Failed to exchange data between server and clients
ethernet_read_keys: Couldn't read remote address
Unable to read to socket/rdma_cm
Failed to exchange data between server and clients
[1] Exit 1 ib_write_bw -d roceo1 --run_infinitely -p 18889
[2] Exit 1 ib_write_bw -d roceo1 --run_infinitely -p 18890
[3] Exit 1 ib_write_bw -d roceo1 --run_infinitely -p 18891
[4] Exit 1 ib_write_bw -d roceo1 --run_infinitely -p 18892
[5] Exit 1 ib_write_bw -d roceo1 --run_infinitely -p 18889 192.168.1.20
[6] Exit 1 ib_write_bw -d roceo1 --run_infinitely -p 18890 192.168.1.20
[7]- Exit 1 ib_write_bw -d roceo1 --run_infinitely -p 18891 192.168.1.20
[8]+ Exit 1 ib_write_bw -d roceo1 --run_infinitely -p 18892 192.168.1.20
[root@localhost ~]#
[root@localhost ~]#
[root@localhost ~]#
[root@localhost ~]#
[root@localhost ~]#
[root@localhost ~]# dmesg
[ 252.044098] usb 1-1.3: USB disconnect, device number 6
[51773.461978] hns3 0000:7d:00.0 eno1: Setting reset type 6
[51773.461984] hns3 0000:7d:00.0: received reset event, reset type is 6
[51773.461994] hns3 0000:7d:00.0: global reset requested
[51773.462001] hns3 0000:7d:00.1: global reset interrupt
[51773.462002] hns3 0000:7d:00.2: global reset interrupt
[51773.462003] hns3 0000:7d:00.0: global reset interrupt
[51773.462006] hns3 0000:7d:00.3: global reset interrupt
[51773.462089] hns3 0000:7d:00.2 eno3: link down
[51773.462156] hns3 0000:7d:00.1 eno2: link down
[51773.495724] hns3 0000:7d:00.0: cleaned 0, need to clean 1
[51773.495730] hns3 0000:7d:00.0: get link status cmd failed -52
[51773.512417] hns3 0000:7d:00.3 eno4: link down
[51773.512432] hns3 0000:7d:00.2: get sfp info failed -16
[51773.518798] hns3 0000:7d:00.0 eno1: link down
[51773.518812] hns3 0000:7d:00.0: get sfp info failed -16
[51773.566765] hns3 0000:7d:00.1: prepare wait ok
[51773.566768] hns3 0000:7d:00.2: prepare wait ok
[51773.618762] hns3 0000:7d:00.3: prepare wait ok
[51773.626762] hns3 0000:7d:00.0: prepare wait ok
[51773.674761] hns3 0000:7d:00.2: In reset process RoCE client uninit.
[51773.734759] hns3 0000:7d:00.0: In reset process RoCE client uninit.
[51773.734892] hns3 0000:7d:00.0 roceo1: failed to post WQE, dev state 1!
[51773.742915] hns3 0000:7d:00.0 roceo1: failed to post wqe for free mr, ret = -5.
[51773.751696] hns3 0000:7d:00.0 roceo1: failed to send wqe (qp:0xa) for free mr, ret = -5.
[51773.761269] hns3 0000:7d:00.0 roceo1: failed to post WQE, dev state 1!
[51773.769283] hns3 0000:7d:00.0 roceo1: failed to post wqe for free mr, ret = -5.
[51773.778071] hns3 0000:7d:00.0 roceo1: failed to send wqe (qp:0xa) for free mr, ret = -5.
[51773.787652] hns3 0000:7d:00.0 roceo1: failed to post WQE, dev state 1!
[51773.795674] hns3 0000:7d:00.0 roceo1: failed to post wqe for free mr, ret = -5.
[51773.804462] hns3 0000:7d:00.0 roceo1: failed to send wqe (qp:0xa) for free mr, ret = -5.
[51773.814050] hns3 0000:7d:00.0 roceo1: failed to post WQE, dev state 1!
[51773.822089] hns3 0000:7d:00.0 roceo1: failed to post wqe for free mr, ret = -5.
[51773.830899] hns3 0000:7d:00.0 roceo1: failed to send wqe (qp:0xa) for free mr, ret = -5.
[51773.840527] hns3 0000:7d:00.0 roceo1: failed to post WQE, dev state 1!
[51773.848650] hns3 0000:7d:00.0 roceo1: failed to post wqe for free mr, ret = -5.
[51773.857545] hns3 0000:7d:00.0 roceo1: failed to send wqe (qp:0xa) for free mr, ret = -5.
[51773.867257] hns3 0000:7d:00.0 roceo1: failed to post WQE, dev state 2!
[51773.875443] hns3 0000:7d:00.0 roceo1: failed to post wqe for free mr, ret = -5.
[51773.884378] hns3 0000:7d:00.0 roceo1: failed to send wqe (qp:0xa) for free mr, ret = -5.
[51773.894107] hns3 0000:7d:00.0 roceo1: failed to post WQE, dev state 2!
[51773.902551] hns3 0000:7d:00.0 roceo1: failed to post wqe for free mr, ret = -5.
[51773.911756] hns3 0000:7d:00.0 roceo1: failed to send wqe (qp:0xa) for free mr, ret = -5.
[51773.921715] hns3 0000:7d:00.0 roceo1: failed to post WQE, dev state 2!
[51773.930158] hns3 0000:7d:00.0 roceo1: failed to post wqe for free mr, ret = -5.
[51773.939322] hns3 0000:7d:00.0 roceo1: failed to send wqe (qp:0xa) for free mr, ret = -5.
[51773.966811] hns3 0000:7d:00.1: The firmware version is 1.20.0.17
[51774.070238] hns3 0000:7d:00.1: Reset done, hclge driver initialization finished.
[51774.087572] hns3 0000:7d:00.3: The firmware version is 1.20.0.17
[51774.150323] hns3 0000:7d:00.3: Reset done, hclge driver initialization finished.
[51774.157462] hns3 0000:7d:00.3: failed to get torus config, ret = -22
[51774.157867] hns3 0000:7d:00.2: Func clear success after reset.
[51774.158386] hns3 0000:7d:00.0: get sfp info failed -16
[51774.158432] hns3 0000:7d:00.2: get sfp info failed -16
[51774.158463] hns3 0000:7d:00.1: failed to get torus config, ret = -22
[51774.158467] hns3 0000:7d:00.1: report reset done!
[51774.158585] hns3 0000:7d:00.0: Func clear success after reset.
[51774.165159] hns3 0000:7d:00.3: report reset done!
[51774.209467] hns3 0000:7d:00.0: The firmware version is 1.20.0.17
[51774.294656] hns3 0000:7d:00.0: Reset done, hclge driver initialization finished.
[51774.300878] hns3 0000:7d:00.0: In reset process RoCE client reinit.
[51774.301746] hns3 0000:7d:00.0: invalid resource values have been adjusted, invalid_flag = 0x80.
[51774.330494] hns3 0000:7d:00.2: The firmware version is 1.20.0.17
[51774.414799] hns3 0000:7d:00.2: Reset done, hclge driver initialization finished.
[51774.420961] hns3 0000:7d:00.2: In reset process RoCE client reinit.
[51774.421858] hns3 0000:7d:00.2: invalid resource values have been adjusted, invalid_flag = 0x80.
[51774.425364] hns3 0000:7d:00.0: Reset done, RoCE client reinit finished.
[51774.426485] hns3 0000:7d:00.0: report reset done!
[51774.441314] debugfs: Directory 'hns_0' with parent 'hns_roce' already present!
[51774.451154] hns3 0000:7d:00.2: Reset done, RoCE client reinit finished.
[51774.452208] hns3 0000:7d:00.2: report reset done!
[51775.816292] hns3 0000:7d:00.3 eno4: link up
[51775.830912] hns3 0000:7d:00.1 eno2: link up
[51776.207123] hns3 0000:7d:00.0 eno1: link up
[51778.100933] hns3 0000:7d:00.2 eno3: link up
[root@localhost ~]#
登录 后才可以发表评论