Hi All,
We have recently encountered a challenging issue in our environment that I wanted to bring to your attention. A few months ago, we performed updates to our Docker and OpenSSL installations, upgrading OpenSSL to version 1.1.1u. Since then, we have been experiencing intermittent traffic blocking (TLS and UDP both for around 10 minutes and getting recovered) on some of our instances.
Prior to the OpenSSL upgrade, we had not observed any such issues. However, the situation has become more complex due to a substantial increase in traffic, with each instance now handling 4K-5K concurrent calls and a daily total of approximately 200,000 calls per instance.
Our initial suspicion was related to a Docker iptable issue. To investigate further, we conducted a GDB trace using "kamctl pstrap" and discovered specific log entries indicating potential issues within the "pthread_rwlock_wrlock" function. This has prompted us to explore whether the problems are linked to the OpenSSL upgrade.
We are considering upgrading Kamailio to the latest version, although this process typically requires a significant amount of time for proper planning and testing. Given the high volume of calls we are currently handling, we are also exploring the possibility of a temporary solution involving a rollback of the OpenSSL version to see if it resolves the issue.
Curiously, during a recent load test using SIPP, which simulated 10,000 concurrent calls with a rate of 100 calls per second, we did not encounter the same problem.
any hints or insights you may have that could help us pinpoint the root cause of the problem .
We do have a complete GDB trace available, but its size necessitates sharing a truncated version for now.
version: kamailio 5.4.3 (x86_64/linux) 0178a5
flags: USE_TCP, USE_TLS, USE_SCTP, TLS_HOOKS, USE_RAW_SOCKS, DISABLE_NAGLE, USE_MCAST, DNS_IP_HACK, SHM_MMAP, PKG_MALLOC, Q_MALLOC, F_MALLOC, TLSF_MALLOC, DBG_SR_MEMORY, USE_FUTEX, FAST_LOCK-ADAPTIVE_WAIT, USE_DNS_CACHE, USE_DNS_FAILOVER, USE_NAPTR, USE_DST_BLACKLIST, HAVE_RESOLV_RES
ADAPTIVE_WAIT_LOOPS 1024, MAX_RECV_BUFFER_SIZE 262144, MAX_URI_SIZE 1024, BUF_SIZE 65535, DEFAULT PKG_SIZE 8MB
poll method support: poll, epoll_lt, epoll_et, sigio_rt, select.
id: 0178a5
compiled on 12:36:56 Oct 22 2023 with gcc 4.8.5
Operating System Host :
LSB Version: :core-4.1-amd64:core-4.1-noarch
Distributor ID: Rocky
Description: Rocky Linux release 8.8 (Green Obsidian)
Release: 8.8
Codename: GreenObsidian
---start 154 -----------------------------------------------------
kamailio: No such file or directory.
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
0x00007fbf6f38a0c3 in __epoll_wait_nocancel () from /lib64/libc.so.6
$1 = 82
$2 = {pid = 154, unix_sock = 182, idx = 27, status = 1, rank = 48, desc = "tcp receiver (tcp:xxx.xxx.xxx.xx:5060) child=27", '\000' <repeats 81 times>}
#0 0x00007fbf6f38a0c3 in __epoll_wait_nocancel () from /lib64/libc.so.6
No symbol table info available.
#1 0x0000000000684858 in io_wait_loop_epoll (h=0xb41720 <io_w>, t=2, repeat=0) at core/io_wait.h:1039
n = 84
r = 0
fm = 0x2
revents = 8201656
__FUNCTION__ = "io_wait_loop_epoll"
#2 0x0000000000699e7a in tcp_receive_loop (unix_sock=186) at core/tcp_read.c:1997
__FUNCTION__ = "tcp_receive_loop"
#3 0x00000000005279e7 in tcp_init_children () at core/tcp_main.c:5134
r = 27
i = -1
reader_fd_1 = 186
pid = 0
si_desc = "tcp receiver (tcp:xxx.xx.xxx.xxx:5060)\000\000\000\355\217c\000\000\000\000\000\240\213{\001\001\000\000\000\017\000\000\000\000\000\000\000\250\a2o\276\177\000\000\200\b\231o\000\000\000\000\260\006\346\261\376\177\000\000\230\210w\000\000\000\000\000=\000\000\000>\000\000\000\270\016z\350\274\177\000\000\060;8o\002\000\000\000P\224\346\352\001\000\000"
si = 0x0
__FUNCTION__ = "tcp_init_children"
#4 0x000000000042ae66 in main_loop () at main.c:1771
i = 16
pid = 94
si = 0x0
si_desc = "udp receiver child=15 sock=xxxx.xxx.xxx.xx:5060\000\000\000\v\235\200", '\000' <repeats 13 times>, "P\r\346\261\376\177\000\000\270%}\000\000\000\000\000T\000\000\000\000\000\000\000\060;8o\277\177\000\000\343\207\201\000\000\000\000\000\277;8o\277\177\000\000 \271A\000\000\000\000\000@\374Ko\276\177\000"
nrprocs = 16
woneinit = 1
__FUNCTION__ = "main_loop"
#5 0x0000000000433ad6 in main (argc=9, argv=0x7ffeb1e60e38) at main.c:2856
cfg_stream = 0x16c9010
c = -1
r = 0
tmp = 0x7ffeb1e61f0c ""
tmp_len = 2496
port = 2496
proto = 1472
ahost = 0x0
aport = 0
options = 0x7d5238 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:x:X:Y:"
ret = -1
seed = 2208520674
rfd = 4
debug_save = 0
debug_flag = 0
dont_fork_cnt = 0
n_lst = 0x7fbf6f40f3a0 <intel_02_known>
p = 0xf0b5ff <Address 0xf0b5ff out of bounds>
st = {st_dev = 76, st_ino = 120506675, st_nlink = 2, st_mode = 16877, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 40, st_blksize = 4096, st_blocks = 0, st_atim = {tv_sec = 1698340416, tv_nsec = 618631009}, st_mtim = {tv_sec = 1698340416, tv_nsec = 618631009}, st_ctim = {tv_sec = 1698340416, tv_nsec = 618631009}, __unused = {0, 0, 0}}
tbuf = '\000' <repeats 376 times>...
option_index = 0
long_options = {{name = 0x7d742f "help", has_arg = 0, flag = 0x0, val = 104}, {name = 0x7d2a34 "version", has_arg = 0, flag = 0x0, val = 118}, {name = 0x7d7434 "alias", has_arg = 1, flag = 0x0, val = 1024}, {name = 0x7d743a "subst", has_arg = 1, flag = 0x0, val = 1025}, {name = 0x7d7440 "substdef", has_arg = 1, flag = 0x0, val = 1026}, {name = 0x7d7449 "substdefs", has_arg = 1, flag = 0x0, val = 1027}, {name = 0x7d7453 "server-id", has_arg = 1, flag = 0x0, val = 1028}, {name = 0x7d745d "loadmodule", has_arg = 1, flag = 0x0, val = 1029}, {name = 0x7d7468 "modparam", has_arg = 1, flag = 0x0, val = 1030}, {name = 0x7d7471 "log-engine", has_arg = 1, flag = 0x0, val = 1031}, {name = 0x7d747c "debug", has_arg = 1, flag = 0x0, val = 1032}, {name = 0x0, has_arg = 0, flag = 0x0, val = 0}}
__FUNCTION__ = "main"
[Inferior 1 (process 154) detached]
---end 154 -------------------------------------------------------
---start 155 -----------------------------------------------------
kamailio: No such file or directory.
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
0x00007fbe6e5bd39e in pthread_rwlock_wrlock () from /lib64/libpthread.so.0
$1 = 83
$2 = {pid = 155, unix_sock = -1, idx = -1, status = 1, rank = -4, desc = "tcp main process", '\000' <repeats 111 times>}
#0 0x00007fbe6e5bd39e in pthread_rwlock_wrlock () from /lib64/libpthread.so.0
No symbol table info available.
#1 0x00007fbe68b745c9 in CRYPTO_THREAD_write_lock () from /lib64/libcrypto.so.1.1
No symbol table info available.
#2 0x00007fbe68afe07e in CRYPTO_free_ex_data () from /lib64/libcrypto.so.1.1
No symbol table info available.
#3 0x00007fbe68ea52c2 in SSL_free () from /lib64/libssl.so.1.1
No symbol table info available.
#4 0x00007fbe69149480 in tls_h_tcpconn_clean_f (c=0x7fbceeae00d0) at tls_server.c:655
extra = 0x7fbceec3b898
__FUNCTION__ = "tls_h_tcpconn_clean_f"
#5 0x00000000004fbf2d in _tcpconn_free (c=0x7fbceeae00d0) at core/tcp_main.c:1529
__FUNCTION__ = "_tcpconn_free"
#6 0x0000000000510912 in tcpconn_put_destroy (tcpconn=0x7fbceeae00d0) at core/tcp_main.c:3284
__FUNCTION__ = "tcpconn_put_destroy"
#7 0x00000000005156a5 in handle_tcp_child (tcp_c=0x7fbe6f813440, fd_i=-1) at core/tcp_main.c:3723
tcpconn = 0x7fbceeae00d0
response = {140449434960080, -2}
cmd = -2
bytes = 16
n = 1865551792
t = 887186228
crt_timeout = 410
con_lifetime = 57680
__FUNCTION__ = "handle_tcp_child"
#8 0x00000000005208ed in handle_io (fm=0x7fbe6f9d7c78, ev=1, idx=-1) at core/tcp_main.c:4542
ret = 1
__FUNCTION__ = "handle_io"
#9 0x00000000004e93ed in io_wait_loop_epoll (h=0xad5740 <io_h>, t=5, repeat=0) at core/io_wait.h:1070
n = 1
r = 0
fm = 0x7fbe6f9d7c78
revents = 1
__FUNCTION__ = "io_wait_loop_epoll"
#10 0x0000000000523c99 in tcp_main_loop () at core/tcp_main.c:4827
si = 0x0
r = 28
__FUNCTION__ = "tcp_main_loop"
#11 0x000000000042b1be in main_loop () at main.c:1779
i = 16
pid = 0
si = 0x0
si_desc = "udp receiver child=15 sock=xxx.xx.xxx.xxx:5060\000\000\000\v\235\200", '\000' <repeats 13 times>, "P\r\346\261\376\177\000\000\270%}\000\000\000\000\000T\000\000\000\000\000\000\000\060;8o\277\177\000\000\343\207\201\000\000\000\000\000\277;8o\277\177\000\000 \271A\000\000\000\000\000@\374Ko\276\177\000"
nrprocs = 16
woneinit = 1
__FUNCTION__ = "main_loop"
#12 0x0000000000433ad6 in main (argc=9, argv=0x7ffeb1e60e38) at main.c:2856
cfg_stream = 0x16c9010
c = -1
r = 0
tmp = 0x7ffeb1e61f0c ""
tmp_len = 2496
port = 2496
proto = 1472
ahost = 0x0
aport = 0
options = 0x7d5238 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:x:X:Y:"
ret = -1
seed = 2208520674
rfd = 4
debug_save = 0
debug_flag = 0
dont_fork_cnt = 0
n_lst = 0x7fbf6f40f3a0 <intel_02_known>
p = 0xf0b5ff <Address 0xf0b5ff out of bounds>
st = {st_dev = 76, st_ino = 120506675, st_nlink = 2, st_mode = 16877, st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 40, st_blksize = 4096, st_blocks = 0, st_atim = {tv_sec = 1698340416, tv_nsec = 618631009}, st_mtim = {tv_sec = 1698340416, tv_nsec = 618631009}, st_ctim = {tv_sec = 1698340416, tv_nsec = 618631009}, __unused = {0, 0, 0}}
tbuf = '\000' <repeats 376 times>...
option_index = 0
long_options = {{name = 0x7d742f "help", has_arg = 0, flag = 0x0, val = 104}, {name = 0x7d2a34 "version", has_arg = 0, flag = 0x0, val = 118}, {name = 0x7d7434 "alias", has_arg = 1, flag = 0x0, val = 1024}, {name = 0x7d743a "subst", has_arg = 1, flag = 0x0, val = 1025}, {name = 0x7d7440 "substdef", has_arg = 1, flag = 0x0, val = 1026}, {name = 0x7d7449 "substdefs", has_arg = 1, flag = 0x0, val = 1027}, {name = 0x7d7453 "server-id", has_arg = 1, flag = 0x0, val = 1028}, {name = 0x7d745d "loadmodule", has_arg = 1, flag = 0x0, val = 1029}, {name = 0x7d7468 "modparam", has_arg = 1, flag = 0x0, val = 1030}, {name = 0x7d7471 "log-engine", has_arg = 1, flag = 0x0, val = 1031}, {name = 0x7d747c "debug", has_arg = 1, flag = 0x0, val = 1032}, {name = 0x0, has_arg = 0, flag = 0x0, val = 0}}
__FUNCTION__ = "main"
[Inferior 1 (process 155) detached]
---end 155 -------------------------------------------------------