The investigation done so far regarding the dead lock shows that 2 process entered a dead
lock, a udp worker and the timer process.
The timer process was trying to delete a dialog reference, and got the dialog lock
(d_table->entry lock) and then tried to get the DMQ node list lock.
On the other hand, the UDP worker was trying to broadcast a DMQ message, got the DMQ node
list lock and then tried to get the d_table entry lock for the same dialog as the timer
process.
```
N 21068 main process - attendant
Y 21247 udp receiver child=0 sock=124.47.168.242:5060 TRYING TO GET LOCK
0x7f03675bdc48
Y 21248 udp receiver child=1 sock=124.47.168.242:5060 TRYING TO GET LOCK
0x7f03675bdc48
Y 21249 udp receiver child=2 sock=124.47.168.242:5060 TRYING TO GET LOCK
0x7f03675bdc48
Y 21250 udp receiver child=3 sock=124.47.168.242:5060 TRYING TO GET LOCK
0x7f03675bdc48
Y 21251 udp receiver child=4 sock=124.47.168.242:5060 TRYING TO GET LOCK
0x7f03675bdc48
Y 21252 udp receiver child=5 sock=124.47.168.242:5060 TRYING TO GET LOCK
0x7f03675bdc48
Y 21253 udp receiver child=6 sock=124.47.168.242:5060 TRYING TO GET LOCK
0x7f036822441c HAS LOCK 0x7f03675bdc48
Y 21254 udp receiver child=7 sock=124.47.168.242:5060
Y 21255 udp receiver child=0 sock=10.51.1.72:5060
Y 21257 udp receiver child=1 sock=10.51.1.72:5060
Y 21258 udp receiver child=2 sock=10.51.1.72:5060
Y 21260 udp receiver child=3 sock=10.51.1.72:5060
Y 21261 udp receiver child=4 sock=10.51.1.72:5060
Y 21262 udp receiver child=5 sock=10.51.1.72:5060
Y 21263 udp receiver child=6 sock=10.51.1.72:5060
Y 21264 udp receiver child=7 sock=10.51.1.72:5060
N 21265 slow timer
Y 21266 timer TRYING TO GET LOCK 0x7f03675bdc48 HAS LOCK 0x7f036822441c
N 21267 secondary timer
N 21268 ctl handler
N 21269 JSONRPCS FIFO
Y 21270 DMQ WORKER TRYING TO GET LOCK 0x7f03675bdcd4
Y 21271 DMQ WORKER TRYING TO GET LOCK 0x7f03675bdcec
Y 21273 Dialog KA Timer
Y 21275 Dialog Clean Timer TRYING TO GET LOCK 0x7f036822441c
N 21278 WEBSOCKET KEEPALIVE
N 21279 WEBSOCKET TIMER
N 21281 TIMER NH
N 21282 TIMER NH
Y 21283 TIMER UAC REG
N 21284 tcp receiver (generic) child=0
N 21285 tcp receiver (generic) child=1
N 21286 tcp receiver (generic) child=2
N 21287 tcp receiver (generic) child=3
N 21289 tcp receiver (generic) child=4
N 21291 tcp receiver (generic) child=5
N 21292 tcp receiver (generic) child=6
N 21293 tcp receiver (generic) child=7
N 21294 tcp main process
```
21266 # timer BT:
```
#0 0x00007f0386d30bf9 in syscall () from /lib64/libc.so.6
#1 0x00007f035d4effa9 in futex_get (lock=0x7f03675bdc48) at ../../core/futexlock.h:121
<- TRYING TO GET LOCK 0x7f03675bdc48
#2 0x00007f035d4f205b in bcast_dmq_message1 (peer=0x7f0368231ff0, body=0x7ffe95bc0920,
except=0x0, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1,
content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>, incl_inactive=0)
at dmq_funcs.c:156
#3 0x00007f035d4f28d4 in bcast_dmq_message (peer=0x7f0368231ff0, body=0x7ffe95bc0920,
except=0x0, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1,
content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:188
#4 0x00007f035c96704d in dlg_dmq_send (body=0x7ffe95bc0920, node=0x0) at dlg_dmq.c:88
#5 0x00007f035c9709d5 in dlg_dmq_replicate_action (action=DLG_DMQ_RM, dlg=0x7f03684b77f8,
needlock=0, node=0x0) at dlg_dmq.c:628
#6 0x00007f035c90a39a in destroy_dlg (dlg=0x7f03684b77f8) at dlg_hash.c:367
#7 0x00007f035c914064 in dlg_unref_helper (dlg=0x7f03684b77f8, cnt=2,
fname=0x7f035c9b222d "dlg_handlers.c", fline=1212) at dlg_hash.c:1081 <<
--- GOT LOCK 0x7f036822441c
(gdb) p *d_entry
$2 = {first = 0x7f0369205248, last = 0x7f036855c680, next_id = 30442, lock = {val = 2},
locker_pid = {val = 21266}, rec_lock_level = 0}
(gdb) p dlg->h_entry
$3 = 2340
(gdb) p &d_entry->lock
$5 = (gen_lock_t *) 0x7f036822441c
#8 0x00007f035c945bd2 in unref_dlg_from_cb (t=0x7f036936b678, type=131072,
param=0x7ffe95bc0e20) at dlg_handlers.c:1212
#9 0x00007f03612d3165 in run_trans_callbacks_internal (cb_lst=0x7f036936b6f0,
type=131072, trans=0x7f036936b678, params=0x7ffe95bc0e20) at t_hooks.c:258
#10 0x00007f03612d3297 in run_trans_callbacks (type=131072, trans=0x7f036936b678, req=0x0,
rpl=0x0, code=0) at t_hooks.c:285
#11 0x00007f03611f9b8f in free_cell_helper (dead_cell=0x7f036936b678, silent=0,
fname=0x7f0361313363 "timer.c", fline=643) at h_table.c:165
#12 0x00007f03612bfc4d in wait_handler (ti=1856451739, wait_tl=0x7f036936b700,
data=0x7f036936b678) at timer.c:643
#13 0x00000000004ea094 in timer_list_expire (t=1856451739, h=0x7f036587d380,
slow_l=0x7f036587f948, slow_mark=5689) at core/timer.c:857
#14 0x00000000004ea53d in timer_handler () at core/timer.c:922
#15 0x00000000004ea9e7 in timer_main () at core/timer.c:961
#16 0x000000000042a838 in main_loop () at main.c:1753
#17 0x0000000000433a96 in main (argc=10, argv=0x7ffe95bc1968) at main.c:2856
```
21253 # udp receiver child=6 sock=124.47.168.242:5060 BT:
```
#0 0x00007f0386d30bf9 in syscall () from /lib64/libc.so.6
#1 0x00007f035c9061a7 in futex_get (lock=0x7f036822441c) at ../../core/futexlock.h:121
#2 0x00007f035c90ee00 in dlg_lookup_mode (h_entry=2340, h_id=30422, lmode=0) at
dlg_hash.c:781 <- TRYING TO GET LOCK KHERE
(gdb) p *d_entry
$1 = {first = 0x7f0369205248, last = 0x7f036855c680, next_id = 30442, lock = {val = 2},
locker_pid = {val = 21266},
rec_lock_level = 0}
(gdb) p h_entry
$2 = 2340
#3 0x00007f035c90fa27 in dlg_get_by_iuid (diuid=0x7f035cbcbd40 <_dlg_ctx+64>) at
dlg_hash.c:849
#4 0x00007f035c9abfd1 in dlg_get_ctx_dialog () at dlg_var.c:940
#5 0x00007f035c945c03 in dlg_lookup_msg_dialog (msg=0x7ffe95bbc8c0, dir=0x7ffe95bbc8bc)
at dlg_handlers.c:1227
#6 0x00007f035c956f77 in dlg_cseq_msg_sent (evp=0x7ffe95bbd2d0) at dlg_cseq.c:393
#7 0x00000000004ec14b in sr_event_exec (type=2, evp=0x7ffe95bbd2d0) at
core/events.c:240
#8 0x00007f03612c1e88 in msg_send_buffer (dst=0x7f036a0a4870,
buf=0x7f036bfa1af8 "KDMQ sip:dialog@10.51.1.71:5060 SIP/2.0\r\nVia: SIP/2.0/UDP
10.51.1.72;branch=z9hG4bK6492.a407a601", '0' <repeats 24 times>,
".0\r\nTo: <sip:dialog@10.51.1.71:5060>\r\nFrom:
<sip:dialog@10.51.1.72:5060>;tag=cf4"..., len=703, flags=0) at
../../core/forward.h:148
#9 0x00007f03612c743e in send_pr_buffer (rb=0x7f036a0a4820, buf=0x7f036bfa1af8, len=703)
at t_funcs.c:69
#10 0x00007f036128d315 in send_prepared_request_impl (request=0x7f036a0a4820,
retransmit=1, branch=0) at uac.c:669
#11 0x00007f036128e89e in t_uac_with_ids (uac_r=0x7ffe95bbd8f0, ret_index=0x0,
ret_label=0x0) at uac.c:753
#12 0x00007f036128e7ca in t_uac (uac_r=0x7ffe95bbd8f0) at uac.c:721
#13 0x00007f0361291a62 in request (uac_r=0x7ffe95bbd8f0, ruri=0x7ffe95bbd8c0,
to=0x7ffe95bbd8c0,
from=0x7ffe95bbd8d0, next_hop=0x0) at uac.c:1089
#14 0x00007f035d4f4053 in dmq_send_message (peer=0x7f0368231ff0, body=0x7ffe95bbdca0,
node=0x7f0368464068,
resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1,
content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:251
#15 0x00007f035d4f24ff in bcast_dmq_message1 (peer=0x7f0368231ff0, body=0x7ffe95bbdca0,
except=0x0,
resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1,
content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>, incl_inactive=0) at
dmq_funcs.c:170 <----- GOT LOCK 0x7f03675bdc48
(gdb) p &dmq_node_list->lock
$1 = (gen_lock_t *) 0x7f03675bdc48
#16 0x00007f035d4f28d4 in bcast_dmq_message (peer=0x7f0368231ff0, body=0x7ffe95bbdca0,
except=0x0,
resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1,
content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:188
#17 0x00007f035c96704d in dlg_dmq_send (body=0x7ffe95bbdca0, node=0x0) at dlg_dmq.c:88
#18 0x00007f035c9709d5 in dlg_dmq_replicate_action (action=DLG_DMQ_STATE,
dlg=0x7f0369205248, needlock=0, node=0x0)
at dlg_dmq.c:628
#19 0x00007f035c94ba78 in dlg_onroute (req=0x7f03860006e0, route_params=0x7ffe95bbe030,
param=0x0)
at dlg_handlers.c:1559
#20 0x00007f0360b665c6 in run_rr_callbacks (req=0x7f03860006e0, rr_param=0x7ffe95bbe120)
at rr_cb.c:96
#21 0x00007f0360b799b2 in after_loose (_m=0x7f03860006e0, preloaded=0) at loose.c:984
#22 0x00007f0360b7a0dc in loose_route (_m=0x7f03860006e0) at loose.c:1018
#23 0x00007f0360b7e240 in w_loose_route (msg=0x7f03860006e0, p1=0x0, p2=0x0) at
rr_mod.c:276
#24 0x00000000005d0bc5 in do_action (h=0x7ffe95bbecd0, a=0x7f0385daa880,
msg=0x7f03860006e0) at core/action.c:1076
#25 0x00000000005ddf78 in run_actions (h=0x7ffe95bbecd0, a=0x7f0385daa880,
msg=0x7f03860006e0) at core/action.c:1581
#26 0x00000000005de5f4 in run_actions_safe (h=0x7ffe95bc09f0, a=0x7f0385daa880,
msg=0x7f03860006e0)
at core/action.c:1645
#27 0x0000000000446834 in rval_get_int (h=0x7ffe95bc09f0, msg=0x7f03860006e0,
i=0x7ffe95bbf5fc, rv=0x7f0385daaac0,
cache=0x0) at core/rvalue.c:915
#28 0x000000000044b365 in rval_expr_eval_int (h=0x7ffe95bc09f0, msg=0x7f03860006e0,
res=0x7ffe95bbf5fc,
rve=0x7f0385daaab8) at core/rvalue.c:1913
#29 0x00000000005d06a6 in do_action (h=0x7ffe95bc09f0, a=0x7f0385db3280,
msg=0x7f03860006e0) at core/action.c:1052
#30 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, a=0x7f0385da9ee8,
msg=0x7f03860006e0) at core/action.c:1581
#31 0x00000000005cd315 in do_action (h=0x7ffe95bc09f0, a=0x7f0385d24ea0,
msg=0x7f03860006e0) at core/action.c:700
#32 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, a=0x7f0385d24c68,
msg=0x7f03860006e0) at core/action.c:1581
#33 0x00000000005d0b34 in do_action (h=0x7ffe95bc09f0, a=0x7f0385d3e420,
msg=0x7f03860006e0) at core/action.c:1067
#34 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, a=0x7f0385d1ae68,
msg=0x7f03860006e0) at core/action.c:1581
#35 0x00000000005de6bc in run_top_route (a=0x7f0385d1ae68, msg=0x7f03860006e0, c=0x0) at
core/action.c:1666
#36 0x00000000005e5c40 in receive_msg (
buf=0xabcb80 <buf.7133> "ACK
sip:61283214906@10.51.1.50:5060;alias=10.51.1.50~5060~1 SIP/2.0\r\nVia: SIP/2.0/UDP
124.47.168.253:6080;branch=z9hG4bK-18807-894-4\r\nFrom: test_894
<sip:61289993111@124.47.168.253:6080>;tag=18807SIPp"..., len=724,
rcv_info=0x7ffe95bc10e0) at core/receive.c:423
#37 0x00000000004c1442 in udp_rcv_loop () at core/udp_server.c:543
#38 0x0000000000429d47 in main_loop () at main.c:1683
#39 0x0000000000433a96 in main (argc=10, argv=0x7ffe95bc1968) at main.c:2856
```
Full kamctl trap:
[
gdb_kamailio_20201102_110450.txt](https://github.com/kamailio/kamailio/file…
--
You are receiving this because you are subscribed to this thread.
Reply to this email directly or view it on GitHub:
https://github.com/kamailio/kamailio/issues/2535#issuecomment-720814149