Description

Kamailio is configured to use the DMQ replication for dialog.
When running a load test Kamailio DMQ workers eventually becomes unresponsive and stops processing DMQ messages. As a side effect the shm memory is fully consumed then SIP processing stops happening properly and everything else stops working due to that.

Troubleshooting

Investigation showed that the DMQ workers are locked in the dmq_send_all_dlgs.
Debugging and studying the function it was identified that it is prone to data inconsistency because it uses a copy of the (d_table->entries)[index] and not a pointer to it. This gives a small window where a change in the d_table for that entry can occur and changing the lock information before the copy can actually get the lock making the process to get stuck in the lock get.
There is a need for some expert to validate this conclusion.

Reproduction

Run a simple load test making calls at a rate of ~5 cps and keep around ~2000 calls connected all the time. A higher cps seems to make it easier to reproduce the problem.
Adding network degradation to the environment makes the problem happens, but when running a tool such as SIPp for the load test, the retransmission can be forced by simply killing the SIPp instance receiving calls which will then force Kamailio to retransmit.

Debugging Data

7236 # DMQ WORKER
#0  0x00007fa7bd760bf9 in syscall () from /lib64/libc.so.6
#1  0x00007fa793395484 in futex_get (lock=0x7ffd072acaa4) at ../../core/futexlock.h:121
#2  0x00007fa7933a123b in dmq_send_all_dlgs (dmq_node=0x0) at dlg_dmq.c:657
        (gdb) p entry
        $1 = {first = 0x7fa79f8540d0, last = 0x7fa79f8540d0, next_id = 86622, lock = {val = 2}, locker_pid = {val = 7211}, rec_lock_level = 0}

        (gdb) p &entry->lock
        $2 = (gen_lock_t *) 0x7ffd072acaa4

        (gdb) p index
        $3 = 187

       (gdb) p (d_table->entries)[index]
       $1 = {first = 0x0, last = 0x0, next_id = 86652, lock = {val = 0}, locker_pid = {val = 0}, rec_lock_level = 0}

       (gdb) p entry
       $2 = {first = 0x7fa79f8540d0, last = 0x7fa79f8540d0, next_id = 86622, lock = {val = 2}, locker_pid = {val = 7211}, rec_lock_level = 0}

#3  0x00007fa79339cfa3 in dlg_dmq_handle_msg (msg=0x7fa79fca15c8, resp=0x7ffd072aceb0, node=0x7fa79ee93cb8) at dlg_dmq.c:391
#4  0x00007fa793f3bb98 in worker_loop (id=0) at worker.c:113
#5  0x00007fa793f39314 in child_init (rank=0) at dmq.c:296
#6  0x00000000005990e8 in init_mod_child (m=0x7fa7bc735d98, rank=0) at core/sr_module.c:780
#7  0x0000000000598d5d in init_mod_child (m=0x7fa7bc736b00, rank=0) at core/sr_module.c:776
#8  0x0000000000598d5d in init_mod_child (m=0x7fa7bc737038, rank=0) at core/sr_module.c:776
#9  0x0000000000598d5d in init_mod_child (m=0x7fa7bc738eb8, rank=0) at core/sr_module.c:776
#10 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73a0e0, rank=0) at core/sr_module.c:776
#11 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73ab70, rank=0) at core/sr_module.c:776
#12 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73d8e0, rank=0) at core/sr_module.c:776
#13 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73e720, rank=0) at core/sr_module.c:776
#14 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73f1b8, rank=0) at core/sr_module.c:776
#15 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73fd08, rank=0) at core/sr_module.c:776
#16 0x0000000000598d5d in init_mod_child (m=0x7fa7bc740128, rank=0) at core/sr_module.c:776
#17 0x0000000000598d5d in init_mod_child (m=0x7fa7bc7409d0, rank=0) at core/sr_module.c:776
#18 0x0000000000598d5d in init_mod_child (m=0x7fa7bc7414d8, rank=0) at core/sr_module.c:776
#19 0x0000000000598d5d in init_mod_child (m=0x7fa7bc742cd8, rank=0) at core/sr_module.c:776
#20 0x0000000000598d5d in init_mod_child (m=0x7fa7bc742fc8, rank=0) at core/sr_module.c:776
#21 0x0000000000598d5d in init_mod_child (m=0x7fa7bc744ed0, rank=0) at core/sr_module.c:776
#22 0x0000000000598d5d in init_mod_child (m=0x7fa7bc745920, rank=0) at core/sr_module.c:776
#23 0x0000000000598d5d in init_mod_child (m=0x7fa7bc748d60, rank=0) at core/sr_module.c:776
#24 0x0000000000598d5d in init_mod_child (m=0x7fa7bc7498f8, rank=0) at core/sr_module.c:776
#25 0x0000000000599887 in init_child (rank=0) at core/sr_module.c:825
#26 0x000000000042ab2d in main_loop () at main.c:1763
#27 0x0000000000433a96 in main (argc=12, argv=0x7ffd072aeb48) at main.c:2856

 
7237 # DMQ WORKER
#0  0x00007fa7bd760bf9 in syscall () from /lib64/libc.so.6
#1  0x00007fa793395484 in futex_get (lock=0x7ffd072acaa4) at ../../core/futexlock.h:121
#2  0x00007fa7933a123b in dmq_send_all_dlgs (dmq_node=0x0) at dlg_dmq.c:657
         (gdb) p entry
        $1 = {first = 0x7fa79f38bdb8, last = 0x7fa79f38bdb8, next_id = 15672, lock = {val = 2}, locker_pid = {val = 7221}, rec_lock_level = 1}

        (gdb) p &entry->lock
        $2 = (gen_lock_t *) 0x7ffd072acaa4

        (gdb) p index
        $1 = 402
 
        (gdb) p (d_table->entries)[index]
        $1 = {first = 0x0, last = 0x0, next_id = 15702, lock = {val = 0}, locker_pid = {val = 0}, rec_lock_level = 0}

        (gdb) p entry
        $4 = {first = 0x7fa79f38bdb8, last = 0x7fa79f38bdb8, next_id = 15672, lock = {val = 2}, locker_pid = {val = 7221}, rec_lock_level = 1}

        (gdb) p &(d_table->entries)[index]->lock
        $5 = (gen_lock_t *) 0x7fa79ec44e2c

        (gdb) p &entry->lock
        $6 = (gen_lock_t *) 0x7ffd072acaa4
 
#3  0x00007fa79339cfa3 in dlg_dmq_handle_msg (msg=0x7fa79f9e4d90, resp=0x7ffd072aceb0, node=0x7fa79ee93cb8) at dlg_dmq.c:391
#4  0x00007fa793f3bb98 in worker_loop (id=1) at worker.c:113
#5  0x00007fa793f39314 in child_init (rank=0) at dmq.c:296
#6  0x00000000005990e8 in init_mod_child (m=0x7fa7bc735d98, rank=0) at core/sr_module.c:780
#7  0x0000000000598d5d in init_mod_child (m=0x7fa7bc736b00, rank=0) at core/sr_module.c:776
#8  0x0000000000598d5d in init_mod_child (m=0x7fa7bc737038, rank=0) at core/sr_module.c:776
#9  0x0000000000598d5d in init_mod_child (m=0x7fa7bc738eb8, rank=0) at core/sr_module.c:776
#10 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73a0e0, rank=0) at core/sr_module.c:776
#11 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73ab70, rank=0) at core/sr_module.c:776
#12 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73d8e0, rank=0) at core/sr_module.c:776
#13 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73e720, rank=0) at core/sr_module.c:776
#14 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73f1b8, rank=0) at core/sr_module.c:776
#15 0x0000000000598d5d in init_mod_child (m=0x7fa7bc73fd08, rank=0) at core/sr_module.c:776
#16 0x0000000000598d5d in init_mod_child (m=0x7fa7bc740128, rank=0) at core/sr_module.c:776
#17 0x0000000000598d5d in init_mod_child (m=0x7fa7bc7409d0, rank=0) at core/sr_module.c:776
#18 0x0000000000598d5d in init_mod_child (m=0x7fa7bc7414d8, rank=0) at core/sr_module.c:776
#19 0x0000000000598d5d in init_mod_child (m=0x7fa7bc742cd8, rank=0) at core/sr_module.c:776
#20 0x0000000000598d5d in init_mod_child (m=0x7fa7bc742fc8, rank=0) at core/sr_module.c:776
#21 0x0000000000598d5d in init_mod_child (m=0x7fa7bc744ed0, rank=0) at core/sr_module.c:776
#22 0x0000000000598d5d in init_mod_child (m=0x7fa7bc745920, rank=0) at core/sr_module.c:776
#23 0x0000000000598d5d in init_mod_child (m=0x7fa7bc748d60, rank=0) at core/sr_module.c:776
#24 0x0000000000598d5d in init_mod_child (m=0x7fa7bc7498f8, rank=0) at core/sr_module.c:776
#25 0x0000000000599887 in init_child (rank=0) at core/sr_module.c:825
#26 0x000000000042ab2d in main_loop () at main.c:1763
#27 0x0000000000433a96 in main (argc=12, argv=0x7ffd072aeb48) at main.c:2856

 

 7211 
(gdb) bt
#0  0x00007fa7bd7677d3 in __recvfrom_nocancel () from /lib64/libc.so.6
#1  0x00000000004d6a45 in udp_rcv_loop () at core/udp_server.c:457
#2  0x0000000000429d47 in main_loop () at main.c:1683
#3  0x0000000000433a96 in main (argc=12, argv=0x7ffd072aeb48) at main.c:2856

 
7221
#0  0x00007fa7bd7677d3 in __recvfrom_nocancel () from /lib64/libc.so.6
#1  0x00000000004d6a45 in udp_rcv_loop () at core/udp_server.c:457
#2  0x0000000000429d47 in main_loop () at main.c:1683
#3  0x0000000000433a96 in main (argc=12, argv=0x7ffd072aeb48) at main.c:2856
 

Possible Solutions

Suggested patch (was not compiled and tested yet):

diff --git a/src/modules/dialog/dlg_dmq.c b/src/modules/dialog/dlg_dmq.c
index a5628f4..b681759 100644
--- a/src/modules/dialog/dlg_dmq.c
+++ b/src/modules/dialog/dlg_dmq.c
@@ -646,22 +646,22 @@
 
 int dmq_send_all_dlgs(dmq_node_t* dmq_node) {
 	int index;
-	dlg_entry_t entry;
+	dlg_entry_t *entry;
 	dlg_cell_t *dlg;
 
 	LM_DBG("sending all dialogs \n");
 
 	for(index = 0; index< d_table->size; index++){
 		/* lock the whole entry */
-		entry = (d_table->entries)[index];
-		dlg_lock( d_table, &entry);
+		entry = &(d_table->entries)[index];
+		dlg_lock( d_table, entry);
 
-		for(dlg = entry.first; dlg != NULL; dlg = dlg->next){
+		for(dlg = entry->first; dlg != NULL; dlg = dlg->next){
 				dlg->dflags |= DLG_FLAG_CHANGED_PROF;
 				dlg_dmq_replicate_action(DLG_DMQ_UPDATE, dlg, 0, dmq_node);
 		}
 
-		dlg_unlock( d_table, &entry);
+		dlg_unlock( d_table, entry);
 	}
 
 	return 0;

Additional Information

kamailio 5.4.2 (x86_64/linux) 5bd72f

Centos 7
3.10.0-1127.19.1.el7.x86_64 #1 SMP Tue Aug 25 17:23:54 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux


You are receiving this because you are subscribed to this thread.
Reply to this email directly, view it on GitHub, or unsubscribe.