The investigation so far shows that two processes entered a deadlock: a UDP worker and the timer process. The timer process was releasing a dialog reference; it acquired the dialog hash entry lock (the d_table entry lock) and then tried to acquire the DMQ node list lock. Meanwhile, the UDP worker was broadcasting a DMQ message; it acquired the DMQ node list lock and then tried to acquire the d_table entry lock for the same hash entry (h_entry 2340) that the timer process holds. Each process therefore holds the lock the other one is waiting for. ``` N 21068 main process - attendant Y 21247 udp receiver child=0 sock=124.47.168.242:5060 TRYING TO GET LOCK 0x7f03675bdc48 Y 21248 udp receiver child=1 sock=124.47.168.242:5060 TRYING TO GET LOCK 0x7f03675bdc48 Y 21249 udp receiver child=2 sock=124.47.168.242:5060 TRYING TO GET LOCK 0x7f03675bdc48 Y 21250 udp receiver child=3 sock=124.47.168.242:5060 TRYING TO GET LOCK 0x7f03675bdc48 Y 21251 udp receiver child=4 sock=124.47.168.242:5060 TRYING TO GET LOCK 0x7f03675bdc48 Y 21252 udp receiver child=5 sock=124.47.168.242:5060 TRYING TO GET LOCK 0x7f03675bdc48 Y 21253 udp receiver child=6 sock=124.47.168.242:5060 TRYING TO GET LOCK 0x7f036822441c HAS LOCK 0x7f03675bdc48 Y 21254 udp receiver child=7 sock=124.47.168.242:5060 Y 21255 udp receiver child=0 sock=10.51.1.72:5060 Y 21257 udp receiver child=1 sock=10.51.1.72:5060 Y 21258 udp receiver child=2 sock=10.51.1.72:5060 Y 21260 udp receiver child=3 sock=10.51.1.72:5060 Y 21261 udp receiver child=4 sock=10.51.1.72:5060 Y 21262 udp receiver child=5 sock=10.51.1.72:5060 Y 21263 udp receiver child=6 sock=10.51.1.72:5060 Y 21264 udp receiver child=7 sock=10.51.1.72:5060 N 21265 slow timer Y 21266 timer TRYING TO GET LOCK 0x7f03675bdc48 HAS LOCK 0x7f036822441c N 21267 secondary timer N 21268 ctl handler N 21269 JSONRPCS FIFO Y 21270 DMQ WORKER TRYING TO GET LOCK 0x7f03675bdcd4 Y 21271 DMQ WORKER TRYING TO GET LOCK 0x7f03675bdcec Y 21273 Dialog KA Timer Y 21275 Dialog Clean Timer TRYING TO GET LOCK 0x7f036822441c N 21278 WEBSOCKET KEEPALIVE N 21279 WEBSOCKET 
TIMER N 21281 TIMER NH N 21282 TIMER NH Y 21283 TIMER UAC REG N 21284 tcp receiver (generic) child=0 N 21285 tcp receiver (generic) child=1 N 21286 tcp receiver (generic) child=2 N 21287 tcp receiver (generic) child=3 N 21289 tcp receiver (generic) child=4 N 21291 tcp receiver (generic) child=5 N 21292 tcp receiver (generic) child=6 N 21293 tcp receiver (generic) child=7 N 21294 tcp main process ``` 21266 # timer BT: ``` #0 0x00007f0386d30bf9 in syscall () from /lib64/libc.so.6 #1 0x00007f035d4effa9 in futex_get (lock=0x7f03675bdc48) at ../../core/futexlock.h:121 <- TRYING TO GET LOCK 0x7f03675bdc48 #2 0x00007f035d4f205b in bcast_dmq_message1 (peer=0x7f0368231ff0, body=0x7ffe95bc0920, except=0x0, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>, incl_inactive=0) at dmq_funcs.c:156 #3 0x00007f035d4f28d4 in bcast_dmq_message (peer=0x7f0368231ff0, body=0x7ffe95bc0920, except=0x0, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:188 #4 0x00007f035c96704d in dlg_dmq_send (body=0x7ffe95bc0920, node=0x0) at dlg_dmq.c:88 #5 0x00007f035c9709d5 in dlg_dmq_replicate_action (action=DLG_DMQ_RM, dlg=0x7f03684b77f8, needlock=0, node=0x0) at dlg_dmq.c:628 #6 0x00007f035c90a39a in destroy_dlg (dlg=0x7f03684b77f8) at dlg_hash.c:367 #7 0x00007f035c914064 in dlg_unref_helper (dlg=0x7f03684b77f8, cnt=2, fname=0x7f035c9b222d "dlg_handlers.c", fline=1212) at dlg_hash.c:1081 << --- GOT LOCK 0x7f036822441c
(gdb) p *d_entry $2 = {first = 0x7f0369205248, last = 0x7f036855c680, next_id = 30442, lock = {val = 2}, locker_pid = {val = 21266}, rec_lock_level = 0} (gdb) p dlg->h_entry $3 = 2340 (gdb) p &d_entry->lock $5 = (gen_lock_t *) 0x7f036822441c
#8 0x00007f035c945bd2 in unref_dlg_from_cb (t=0x7f036936b678, type=131072, param=0x7ffe95bc0e20) at dlg_handlers.c:1212 #9 0x00007f03612d3165 in run_trans_callbacks_internal (cb_lst=0x7f036936b6f0, type=131072, trans=0x7f036936b678, params=0x7ffe95bc0e20) at t_hooks.c:258 #10 0x00007f03612d3297 in run_trans_callbacks (type=131072, trans=0x7f036936b678, req=0x0, rpl=0x0, code=0) at t_hooks.c:285 #11 0x00007f03611f9b8f in free_cell_helper (dead_cell=0x7f036936b678, silent=0, fname=0x7f0361313363 "timer.c", fline=643) at h_table.c:165 #12 0x00007f03612bfc4d in wait_handler (ti=1856451739, wait_tl=0x7f036936b700, data=0x7f036936b678) at timer.c:643 #13 0x00000000004ea094 in timer_list_expire (t=1856451739, h=0x7f036587d380, slow_l=0x7f036587f948, slow_mark=5689) at core/timer.c:857 #14 0x00000000004ea53d in timer_handler () at core/timer.c:922 #15 0x00000000004ea9e7 in timer_main () at core/timer.c:961 #16 0x000000000042a838 in main_loop () at main.c:1753 #17 0x0000000000433a96 in main (argc=10, argv=0x7ffe95bc1968) at main.c:2856
``` 21253 # udp receiver child=6 sock=124.47.168.242:5060 BT: ``` #0 0x00007f0386d30bf9 in syscall () from /lib64/libc.so.6 #1 0x00007f035c9061a7 in futex_get (lock=0x7f036822441c) at ../../core/futexlock.h:121 #2 0x00007f035c90ee00 in dlg_lookup_mode (h_entry=2340, h_id=30422, lmode=0) at dlg_hash.c:781 <- TRYING TO GET LOCK 0x7f036822441c HERE
(gdb) p *d_entry $1 = {first = 0x7f0369205248, last = 0x7f036855c680, next_id = 30442, lock = {val = 2}, locker_pid = {val = 21266}, rec_lock_level = 0}
(gdb) p h_entry $2 = 2340
#3 0x00007f035c90fa27 in dlg_get_by_iuid (diuid=0x7f035cbcbd40 <_dlg_ctx+64>) at dlg_hash.c:849 #4 0x00007f035c9abfd1 in dlg_get_ctx_dialog () at dlg_var.c:940 #5 0x00007f035c945c03 in dlg_lookup_msg_dialog (msg=0x7ffe95bbc8c0, dir=0x7ffe95bbc8bc) at dlg_handlers.c:1227 #6 0x00007f035c956f77 in dlg_cseq_msg_sent (evp=0x7ffe95bbd2d0) at dlg_cseq.c:393 #7 0x00000000004ec14b in sr_event_exec (type=2, evp=0x7ffe95bbd2d0) at core/events.c:240 #8 0x00007f03612c1e88 in msg_send_buffer (dst=0x7f036a0a4870, buf=0x7f036bfa1af8 "KDMQ sip:dialog@10.51.1.71:5060 SIP/2.0\r\nVia: SIP/2.0/UDP 10.51.1.72;branch=z9hG4bK6492.a407a601", '0' <repeats 24 times>, ".0\r\nTo: sip:dialog@10.51.1.71:5060\r\nFrom: sip:dialog@10.51.1.72:5060;tag=cf4"..., len=703, flags=0) at ../../core/forward.h:148 #9 0x00007f03612c743e in send_pr_buffer (rb=0x7f036a0a4820, buf=0x7f036bfa1af8, len=703) at t_funcs.c:69 #10 0x00007f036128d315 in send_prepared_request_impl (request=0x7f036a0a4820, retransmit=1, branch=0) at uac.c:669 #11 0x00007f036128e89e in t_uac_with_ids (uac_r=0x7ffe95bbd8f0, ret_index=0x0, ret_label=0x0) at uac.c:753 #12 0x00007f036128e7ca in t_uac (uac_r=0x7ffe95bbd8f0) at uac.c:721 #13 0x00007f0361291a62 in request (uac_r=0x7ffe95bbd8f0, ruri=0x7ffe95bbd8c0, to=0x7ffe95bbd8c0, from=0x7ffe95bbd8d0, next_hop=0x0) at uac.c:1089 #14 0x00007f035d4f4053 in dmq_send_message (peer=0x7f0368231ff0, body=0x7ffe95bbdca0, node=0x7f0368464068, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:251 #15 0x00007f035d4f24ff in bcast_dmq_message1 (peer=0x7f0368231ff0, body=0x7ffe95bbdca0, except=0x0, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>, incl_inactive=0) at dmq_funcs.c:170 <----- GOT LOCK 0x7f03675bdc48
(gdb) p &dmq_node_list->lock $1 = (gen_lock_t *) 0x7f03675bdc48
#16 0x00007f035d4f28d4 in bcast_dmq_message (peer=0x7f0368231ff0, body=0x7ffe95bbdca0, except=0x0, resp_cback=0x7f035cbc9af0 <dlg_dmq_resp_callback>, max_forwards=1, content_type=0x7f035cbc9ab0 <dlg_dmq_content_type>) at dmq_funcs.c:188 #17 0x00007f035c96704d in dlg_dmq_send (body=0x7ffe95bbdca0, node=0x0) at dlg_dmq.c:88 #18 0x00007f035c9709d5 in dlg_dmq_replicate_action (action=DLG_DMQ_STATE, dlg=0x7f0369205248, needlock=0, node=0x0) at dlg_dmq.c:628 #19 0x00007f035c94ba78 in dlg_onroute (req=0x7f03860006e0, route_params=0x7ffe95bbe030, param=0x0) at dlg_handlers.c:1559 #20 0x00007f0360b665c6 in run_rr_callbacks (req=0x7f03860006e0, rr_param=0x7ffe95bbe120) at rr_cb.c:96 #21 0x00007f0360b799b2 in after_loose (_m=0x7f03860006e0, preloaded=0) at loose.c:984 #22 0x00007f0360b7a0dc in loose_route (_m=0x7f03860006e0) at loose.c:1018 #23 0x00007f0360b7e240 in w_loose_route (msg=0x7f03860006e0, p1=0x0, p2=0x0) at rr_mod.c:276 #24 0x00000000005d0bc5 in do_action (h=0x7ffe95bbecd0, a=0x7f0385daa880, msg=0x7f03860006e0) at core/action.c:1076 #25 0x00000000005ddf78 in run_actions (h=0x7ffe95bbecd0, a=0x7f0385daa880, msg=0x7f03860006e0) at core/action.c:1581 #26 0x00000000005de5f4 in run_actions_safe (h=0x7ffe95bc09f0, a=0x7f0385daa880, msg=0x7f03860006e0) at core/action.c:1645 #27 0x0000000000446834 in rval_get_int (h=0x7ffe95bc09f0, msg=0x7f03860006e0, i=0x7ffe95bbf5fc, rv=0x7f0385daaac0, cache=0x0) at core/rvalue.c:915 #28 0x000000000044b365 in rval_expr_eval_int (h=0x7ffe95bc09f0, msg=0x7f03860006e0, res=0x7ffe95bbf5fc, rve=0x7f0385daaab8) at core/rvalue.c:1913 #29 0x00000000005d06a6 in do_action (h=0x7ffe95bc09f0, a=0x7f0385db3280, msg=0x7f03860006e0) at core/action.c:1052 #30 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, a=0x7f0385da9ee8, msg=0x7f03860006e0) at core/action.c:1581 #31 0x00000000005cd315 in do_action (h=0x7ffe95bc09f0, a=0x7f0385d24ea0, msg=0x7f03860006e0) at core/action.c:700 #32 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, 
a=0x7f0385d24c68, msg=0x7f03860006e0) at core/action.c:1581 #33 0x00000000005d0b34 in do_action (h=0x7ffe95bc09f0, a=0x7f0385d3e420, msg=0x7f03860006e0) at core/action.c:1067 #34 0x00000000005ddf78 in run_actions (h=0x7ffe95bc09f0, a=0x7f0385d1ae68, msg=0x7f03860006e0) at core/action.c:1581 #35 0x00000000005de6bc in run_top_route (a=0x7f0385d1ae68, msg=0x7f03860006e0, c=0x0) at core/action.c:1666 #36 0x00000000005e5c40 in receive_msg ( buf=0xabcb80 <buf.7133> "ACK sip:61283214906@10.51.1.50:5060;alias=10.51.1.50~5060~1 SIP/2.0\r\nVia: SIP/2.0/UDP 124.47.168.253:6080;branch=z9hG4bK-18807-894-4\r\nFrom: test_894 sip:61289993111@124.47.168.253:6080;tag=18807SIPp"..., len=724, rcv_info=0x7ffe95bc10e0) at core/receive.c:423 #37 0x00000000004c1442 in udp_rcv_loop () at core/udp_server.c:543 #38 0x0000000000429d47 in main_loop () at main.c:1683 #39 0x0000000000433a96 in main (argc=10, argv=0x7ffe95bc1968) at main.c:2856 ```
Full kamctl trap: [gdb_kamailio_20201102_110450.txt](https://github.com/kamailio/kamailio/files/5478787/gdb_kamailio_20201102_110...)