Segfault during initialization of USRP
Issue Description
This is a segmentation fault reported from CI loops (https://jenkins-oai.eurecom.fr/job/RAN-SA-2x2-Module-CN5G/817/). I am in possession of the coredump and docker image that was running the software. The gNB segfaults inside USRP lib during initialization:
(gdb) bt
#0 0x00007f0e7f6236fd in uhd::rfnoc::chdr::mgmt_payload::deserialize(unsigned long const*, unsigned long, std::function<unsigned long (unsigned long)> const&) ()
from /usr/local/lib64/libuhd.so.4.4.0
#1 0x00007f0e7f690b34 in uhd::rfnoc::mgmt::mgmt_portal_impl::_send_recv_mgmt_transaction(uhd::rfnoc::chdr_ctrl_xport&, uhd::rfnoc::chdr::mgmt_payload const&, double) [clone .constprop.0] () from /usr/local/lib64/libuhd.so.4.4.0
#2 0x00007f0e7f695c1d in uhd::rfnoc::mgmt::mgmt_portal_impl::_get_ostrm_status(uhd::rfnoc::chdr_ctrl_xport&, uhd::rfnoc::detail::topo_node_t const&) ()
from /usr/local/lib64/libuhd.so.4.4.0
#3 0x00007f0e7f69874b in uhd::rfnoc::mgmt::mgmt_portal_impl::config_local_rx_stream_commit(uhd::rfnoc::chdr_ctrl_xport&, unsigned short const&, double, bool) ()
from /usr/local/lib64/libuhd.so.4.4.0
#4 0x00007f0e7f62c99a in uhd::rfnoc::chdr_rx_data_xport::configure_sep(std::shared_ptr<uhd::transport::io_service>, std::shared_ptr<uhd::transport::recv_link_if>, std::shared_ptr<uhd::transport::send_link_if>, uhd::rfnoc::chdr::chdr_packet_factory const&, uhd::rfnoc::mgmt::mgmt_portal&, std::pair<unsigned short, unsigned short> const&, uhd::rfnoc::sw_buff_t, uhd::rfnoc::sw_buff_t, uhd::rfnoc::stream_buff_params_t const&, uhd::rfnoc::stream_buff_params_t const&, uhd::rfnoc::stream_buff_params_t const&, bool, std::function<void ()>) ()
from /usr/local/lib64/libuhd.so.4.4.0
#5 0x00007f0e7fb06a5d in uhd::mpmd::mpmd_mboard_impl::mpmd_mb_iface::make_rx_data_transport(uhd::rfnoc::mgmt::mgmt_portal&, std::pair<std::pair<unsigned short, unsigned short>, std::pair<unsigned short, unsigned short> > const&, std::pair<unsigned short, unsigned short> const&, uhd::rfnoc::sw_buff_t, uhd::rfnoc::sw_buff_t, uhd::device_addr_t const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) () from /usr/local/lib64/libuhd.so.4.4.0
#6 0x00007f0e7f649116 in link_stream_manager_impl::create_device_to_host_data_stream(std::pair<unsigned short, unsigned short>, uhd::rfnoc::sw_buff_t, uhd::rfnoc::sw_buff_t, uhd::device_addr_t const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
--Type <RET> for more, q to quit, c to continue without paging--
from /usr/local/lib64/libuhd.so.4.4.0
#7 0x00007f0e7f64c748 in graph_stream_manager_impl::create_device_to_host_data_stream(std::pair<unsigned short, unsigned short>, uhd::rfnoc::sw_buff_t, uhd::rfnoc::sw_buff_t, unsigned long, uhd::device_addr_t const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
from /usr/local/lib64/libuhd.so.4.4.0
#8 0x00007f0e7f6841a5 in rfnoc_graph_impl::connect(uhd::rfnoc::block_id_t const&, unsigned long, std::shared_ptr<uhd::rx_streamer>, unsigned long, unsigned long)
() from /usr/local/lib64/libuhd.so.4.4.0
#9 0x00007f0e7f7a862a in multi_usrp_rfnoc::get_rx_stream(uhd::stream_args_t const&) () from /usr/local/lib64/libuhd.so.4.4.0
#10 0x00007f0e99460816 in device_init (device=0x1a956ba0,
openair0_cfg=0x1a9577b8) at /oai-ran/radio/USRP/usrp_lib.cpp:1478
#11 0x00000000006fdb1a in load_lib (device=device@entry=0x1a956ba0,
openair0_cfg=openair0_cfg@entry=0x1a9577b8, cfg=cfg@entry=0x0,
flag=flag@entry=0 '\000') at /oai-ran/radio/COMMON/common_lib.c:139
#12 0x00000000006fdcec in openair0_device_load (device=device@entry=0x1a956ba0,
openair0_cfg=openair0_cfg@entry=0x1a9577b8)
at /oai-ran/radio/COMMON/common_lib.c:147
#13 0x00000000006f56a0 in ru_thread (param=0x1a956710)
at /oai-ran/executables/nr-ru.c:1209
#14 0x00007f0e9c129c02 in start_thread () from /lib64/libc.so.6
#15 0x00007f0e9c1aded4 in clone () from /lib64/libc.so.6
Investigation
I'm certain that the segfault is caused by reading buff in this line:
std::list<uint64_t> src_list(buff, buff + (buff_size * (_padding_size + 1)));
│ 0x7f0e7f6236e1 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+81> lea (%rsi,%rdx,8),%r13
│ 0x7f0e7f6236e5 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+85> cmp %rsi,%r13 │
│ 0x7f0e7f6236e8 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+88> je 0x7f0e7f62371c <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+140> │
│ 0x7f0e7f6236ea <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+90> nopw 0x0(%rax,%rax,1) │
│ 0x7f0e7f6236f0 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+96> mov $0x18,%edi │
│ 0x7f0e7f6236f5 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+101> call 0x7f0e7f326810 <_Znwm@plt> │
│ 0x7f0e7f6236fa <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+106> mov %rax,%rdi │
│ >0x7f0e7f6236fd <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+109> mov 0x0(%rbp),%rax │
│ 0x7f0e7f623701 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+113> mov %r12,%rsi │
│ 0x7f0e7f623704 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+116> add $0x8,%rbp │
│ 0x7f0e7f623708 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+120> mov %rax,0x10(%rdi) │
│ 0x7f0e7f62370c <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+124> call 0x7f0e7f326cd0 <_ZNSt8__detail15_List_node_base7_M_hookEPS0_@plt> │
│ 0x7f0e7f623711 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+129> addq $0x1,0x70(%rsp) │
│ 0x7f0e7f623717 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+135> cmp %rbp,%r13 │
│ 0x7f0e7f62371a <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+138> jne 0x7f0e7f6236f0 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+96>
Comments: the buff + offset dereference (the cause of the segfault):
mov 0x0(%rbp),%rax
This is the end of the loop generated by the std::list constructor:
cmp %rbp,%r13
jne 0x7f0e7f6236f0
r13
holds the address 8 bytes past last element, calculated here:
lea (%rsi,%rdx,8),%r13
This corresponds to
buff + (buff_size * (_padding_size + 1))
rdx
is already equal to buff_size * (_padding_size + 1)
The segfault happens because the loop end condition is invalid:
(gdb) p $r13
$17 = 139700656659168
(gdb) p (uint64_t)$rbp
$18 = 139700658503680
r13 is lower than rbp, so the calculated loop end address lies before the current iterator position and the loop never terminates.
I've tried to access the this pointer (mgmt_payload), and I think it is saved in rbx:
│ 0x7f0e7f6236b3 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+35> mov 0x8(%rdi),%rdx // This is loading padding_size for the loop end condition calculation │
│ 0x7f0e7f6236b7 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+39> lea 0x60(%rsp),%r12 │
│ 0x7f0e7f6236bc <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+44> mov %rdi,%rbx // this pointer in rbx?
p *(size_t*)($rbx+8)
$10 = 0
(gdb) x/40x $rbx
0x7f0e90bfcad0: 0x0003 0x0100 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
This is the mgmt_payload
type (https://github.com/EttusResearch/uhd/blob/a5ed1872be6d0fc36de9a7e0b508933da1f119bc/host/include/uhd/rfnoc/chdr_types.hpp#L953)
Looking for buff_size (to complete the loop end condition calculation), starting from frame 1:
│ 0x7f0e7f690b29 <_ZN3uhd5rfnoc4mgmt16mgmt_portal_impl27_send_recv_mgmt_transactionERNS0_15chdr_ctrl_xportERKNS0_4chdr12mgmt_payloadEd.constprop.0+729> mov %rbp,%rdx │
│ 0x7f0e7f690b2c <_ZN3uhd5rfnoc4mgmt16mgmt_portal_impl27_send_recv_mgmt_transactionERNS0_15chdr_ctrl_xportERKNS0_4chdr12mgmt_payloadEd.constprop.0+732> mov %r13,%rdi │
│ 0x7f0e7f690b2f <_ZN3uhd5rfnoc4mgmt16mgmt_portal_impl27_send_recv_mgmt_transactionERNS0_15chdr_ctrl_xportERKNS0_4chdr12mgmt_payloadEd.constprop.0+735> call 0x7f0e7f3299e0 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE@plt>
rdx is set here; in frame 0 it is read:
│ 0x7f0e7f6236a6 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+22> cmp $0x1,%rdx
Therefore rdx is buff_size in frame 0; this assembly corresponds to this line in the code:
UHD_ASSERT_THROW(buff_size > 1);
Also, rbp is buff_size in frame 1. It is pushed on the stack here:
│ 0x7f0e7f623690 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE> push %r15 │
│ 0x7f0e7f623692 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+2> push %r14 │
│ 0x7f0e7f623694 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+4> push %r13 │
│ 0x7f0e7f623696 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+6> push %r12 │
│ 0x7f0e7f623698 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+8> push %rbp │
│ 0x7f0e7f623699 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+9> push %rbx │
│ 0x7f0e7f62369a <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+10> sub $0x208,%rsp
push stores the register on the stack. The stack grows downwards (i.e. a deeper stack means a lower address).
(gdb) p $rsp
$18 = (void *) 0x7f0e90bfc6b0
(gdb) p ($rsp+0x208)
$19 = (void *) 0x7f0e90bfc8b8
rsp + 0x208
should be the frame pointer.
0x7f0e90bfc8b8: 0x67a0 0x7409 0x7f0e 0x0000 0xffff 0xffff 0xffff 0x1fff
0x7f0e90bfc8c8: 0x6b80 0x740e 0x7f0e 0x0000 0xcad0 0x90bf 0x7f0e 0x0000
0x7f0e90bfc8d8: 0xcae0 0x90bf 0x7f0e 0x0000 0xc940 0x90bf 0x7f0e 0x0000
0x7f0e90bfc8e8: 0x0b34 0x7f69 0x7f0e 0x0000 0x5050 0x740e 0x7f0e 0x0000
0x7f0e90bfc8f8: 0x0003 0x0000 0x0000 0x0000 0x0018 0x0000 0x0000 0x0000
0x7f0e90bfc908: 0xc970 0x90bf 0x7f0e 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e90bfc918: 0xe4f0 0x7402 0x7f0e 0x0000 0x6b80 0x740e 0x7f0e 0x0000
The value at address 0x7f0e90bfc8c0 (0xffff 0xffff 0xffff 0x1fff, i.e. 0x1fffffffffffffff) should be the saved rbp, which is buff_size in frame 1 - uncertain.
this must be the list size
field
│ 0x7f0e7f623711 <_ZN3uhd5rfnoc4chdr12mgmt_payload11deserializeEPKmmRKSt8functionIFmmEE+129> addq $0x1,0x70(%rsp)
(gdb) p *(uint64_t*)($rsp+(0x70))
$28 = 230563
So 230563 elements were added before crash. Therefore buff
might be rbp - 230563 * 8
(gdb) p $rbp - *(uint64_t*)($rsp+(0x70)) * 8
$30 = (void *) 0x7f0e9803dae8
0x7f0e9803dae8: 0x0801 0x0024 0x0000 0x0000 0x0800 0x0028 0xffff 0x00ff
0x7f0e9803daf8: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e9803db08: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e9803db18: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e9803db28: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e9803db38: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e9803db48: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e9803db58: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e9803db68: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
0x7f0e9803db78: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
Looks good. It does look like a buffer pointer, but the buff_size
must be wrong. It could be -1 (cast to uint64_t) - that would explain the faulty loop end condition.
This is as much time as I am willing to put into this.
Looking at the documentation I found one remark about calling multi_usrp_rfnoc::get_rx_stream(uhd::stream_args_t const&)
(in the call stack) [source: https://files.ettus.com/manual/classuhd_1_1device.html#a0a9e36f353dcce36b4dd8d394c8813e3]
Make a new receive streamer from the streamer arguments.
Note: For RFNoC devices, there can always be only one streamer per channel. When calling get_rx_stream() a second time, the first streamer connected to this channel must be destroyed beforehand. Multiple streamers for different channels are allowed. For non-RFNoC devices, you can only have one RX streamer at a time. Be careful to destroy the old one if you want to create a new one.
How can we ensure the device is in a correct state, i.e. that the "old" rx streamer is destroyed correctly?
Conclusions
- Consult authors of the driver regarding the rx streamer behavior
- Suggestion for UHD: add extra asserts on buff_size in the code. According to the comments in the code, these functions do not seem to be time-critical.