Steps: (1) 9 users and 1 mcu registered the server (2) All 9 users connect to the mcu through the server (3) For a while, some users showed registration fail, we found Kamailio does not respond to any further SIP message.
I checked this using `netstat -apn` and it showed:
``` Active Internet connections (servers and established) Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name tcp 0 0 10.100.92.42:5060 0.0.0.0:* LISTEN 9384/kamailio tcp 0 0 10.100.92.42:5061 0.0.0.0:* LISTEN 9384/kamailio tcp 48963 0 10.100.92.42:5060 10.100.92.42:42177 ESTABLISHED 9375/kamailio tcp 29621 0 10.100.92.42:5060 10.100.92.40:34392 ESTABLISHED 9375/kamailio tcp 1617 0 10.100.92.42:43997 10.100.92.42:5062 ESTABLISHED 9375/kamailio tcp 2506 0 10.100.92.42:5060 10.100.92.68:37386 ESTABLISHED 9383/kamailio udp 0 0 127.0.0.1:50180 127.0.0.1:7722 ESTABLISHED 9384/kamailio udp 0 0 127.0.0.1:47125 127.0.0.1:7722 ESTABLISHED 9378/kamailio udp 0 0 127.0.0.1:55318 127.0.0.1:7722 ESTABLISHED 9375/kamailio udp 0 0 127.0.0.1:44595 127.0.0.1:7722 ESTABLISHED 9382/kamailio udp 0 0 127.0.0.1:41045 127.0.0.1:7722 ESTABLISHED 9383/kamailio udp 0 0 127.0.0.1:52901 127.0.0.1:7722 ESTABLISHED 9380/kamailio udp 0 0 127.0.0.1:39078 127.0.0.1:7722 ESTABLISHED 9376/kamailio udp 0 0 127.0.0.1:36548 127.0.0.1:7722 ESTABLISHED 9377/kamailio udp 0 0 127.0.0.1:55005 127.0.0.1:7722 ESTABLISHED 9374/kamailio udp 0 0 127.0.0.1:53049 127.0.0.1:7722 ESTABLISHED 9381/kamailio udp 0 0 127.0.0.1:44358 127.0.0.1:7722 ESTABLISHED 9374/kamailio udp 0 0 127.0.0.1:60253 127.0.0.1:7722 ESTABLISHED 9379/kamailio udp 0 0 10.100.92.42:5060 0.0.0.0:* 9374/kamailio udp 0 0 ::ffff:127.0.0.1:7722 :::* 9354/rtpengine raw 0 0 0.0.0.0:255 0.0.0.0:* 255 9374/kamailio ```
And I attached the gdb to the those processed which Recv-Q value were high, it showed:
PID [9375] ``` #0 0x400a7450 in recvfrom () from /lib/libpthread.so.0 No symbol table info available. #1 0x00265848 in udp_rcv_loop () at udp_server.c:446 len = 472 buf = "REGISTER sip:10.100.92.42:5060 SIP/2.0\r\nVia: SIP/2.0/UDP 10.100.91.13:5060;rport;branch=z9hG4bK649620372\r\nFrom: sip:DEMOROOM_2@10.100.92.42;tag=658334621\r\nTo: sip:DEMOROOM_2@10.100.92.42\r\nCall-ID:"... tmp = 0x49fbfc "10.100.92.43" from = 0x402d5b24 fromlen = 16 ri = {src_ip = {af = 2, len = 4, u = {addrl = {224093194, 3200776404, 1935216, 1734536862}, addr32 = {224093194, 3200776404, 1935216, 1734536862}, addr16 = {25610, 3419, 63700, 48839, 34672, 29, 61086, 26466}, addr = "\nd[\r\324\370\307\276p\207\035\000\236\356bg"}}, dst_ip = {af = 2, len = 4, u = {addrl = {710697994, 0, 0, 0}, addr32 = {710697994, 0, 0, 0}, addr16 = {25610, 10844, 0, 0, 0, 0, 0, 0}, addr = "\nd\*", '\000' <repeats 11 times>}}, src_port = 5060, dst_port = 5060, proto_reserved1 = 0, proto_reserved2 = 0, src_su = {s = {sa_family = 2, sa_data = "\023\304\nd[\r\000\000\000\000\000\000\000"}, sin = { sin_family = 2, sin_port = 50195, sin_addr = {s_addr = 224093194}, sin_zero = "\000\000\000\000\000\000\000"}, sin6 = {sin6_family = 2, sin6_port = 50195, sin6_flowinfo = 224093194, sin6_addr = {__in6_u = {__u6_addr8 = '\000' <repeats 15 times>, __u6_addr16 = {0, 0, 0, 0, 0, 0, 0, 0}, __u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}, bind_address = 0x40261e14, proto = 1 '\001'} __FUNCTION__ = "udp_rcv_loop" #2 0x000c90a0 in main_loop () at main.c:1629 i = 0 pid = 0 si = 0x40261e14 si_desc = "udp receiver child=0 sock=10.100.92.42:5060\000\244\372\307\276\304y2A\000\000,\000\000\000X\000 \000\062@ \313'@h\t;\000\230\325&@\000\000\000\000x\021\002\000\060K1@\000\000\000\000d\372\307\276\260\275'\000\230\331\070\000X\311\070\000\030\000\000\000\000PVA\000\000\000\000\000\000\a@Lu\223A" r = 1074200576 nrprocs = 1 __FUNCTION__ = "main_loop" #3 0x000d0830 in main (argc=5, argv=0xbec7fd04) at main.c:2581 cfg_stream = 0x4b5008 c = -1 r = 0 tmp = 0x40070ac4 "P\002\006@\a" tmp_len = -1094190232 port = 1074045560 proto = -1094190164 options = 0x383880 ":f:cm:M:dVIhEeb:l:L:n:vKrRDTN:W:w:t:u:g:P:G:SQ:O:a:A:" ret = -1 seed = 3985271057 rfd = 206 debug_save = 0 debug_flag = 0 dont_fork_cnt = 0 n_lst = 0x0 p = 0x5 <Address 0x5 out of bounds> __FUNCTION__ = "main" ```
PID [9383]
``` #0 0x400a6a04 in __lll_lock_wait () from /lib/libpthread.so.0 #1 0x400a10b8 in pthread_mutex_lock () from /lib/libpthread.so.0 #2 0x002145b4 in tcpconn_do_send (fd=21, c=0x41969534, buf=0x402cbcb0 "SIP/2.0 200 Keepalive\r\nVia: SIP/2.0/TCP 10.100.92.42:5062;rport=42177;branch=z9hG4bK416855936;received=10.100.92.42\r\nFrom: sip:10.100.92.42:42177;tag=1996784611\r\nTo: sip:10.100.92.42:5060;tag=e926"..., len=340, send_flags=..., resp=0xbec7daf8, locked=0) at tcp_main.c:2520 #3 0x002130f0 in tcpconn_send_put (c=0x41969534, buf=0x402cbcb0 "SIP/2.0 200 Keepalive\r\nVia: SIP/2.0/TCP 10.100.92.42:5062;rport=42177;branch=z9hG4bK416855936;received=10.100.92.42\r\nFrom: sip:10.100.92.42:42177;tag=1996784611\r\nTo: sip:10.100.92.42:5060;tag=e926"..., len=340, send_flags=...) at tcp_main.c:2377 #4 0x002105c0 in tcp_send (dst=0xbec7e38c, from=0x0, buf=0x402cbcb0 "SIP/2.0 200 Keepalive\r\nVia: SIP/2.0/TCP 10.100.92.42:5062;rport=42177;branch=z9hG4bK416855936;received=10.100.92.42\r\nFrom: sip:10.100.92.42:42177;tag=1996784611\r\nTo: sip:10.100.92.42:5060;tag=e926"..., len=340) at tcp_main.c:2133 #5 0x40e33164 in msg_send (dst=0xbec7e38c, buf=0x402cbcb0 "SIP/2.0 200 Keepalive\r\nVia: SIP/2.0/TCP 10.100.92.42:5062;rport=42177;branch=z9hG4bK416855936;received=10.100.92.42\r\nFrom: sip:10.100.92.42:42177;tag=1996784611\r\nTo: sip:10.100.92.42:5060;tag=e926"..., len=340) at ../../forward.h:225 #6 0x40e362e8 in sl_reply_helper (msg=0x40342814, code=200, reason=0x402e1d6c "Keepalive", tag=0x0) at sl_funcs.c:224 #7 0x40e373e0 in sl_send_reply (msg=0x40342814, code=200, reason=0x402e1d6c "Keepalive") at sl_funcs.c:327 #8 0x40e2b30c in w_sl_send_reply (msg=0x40342814, p1=0x40282108 "\244\324(@\001", p2=0x40282140 "d\320(@ ") at sl.c:236 #9 0x00029b90 in do_action (h=0xbec7f110, a=0x4028cf34, msg=0x40342814) at action.c:1100 #10 0x00036ce8 in run_actions (h=0xbec7f110, a=0x4028cf34, msg=0x40342814) at action.c:1583 #11 0x00029a2c in do_action (h=0xbec7f110, a=0x4028d120, msg=0x40342814) at action.c:1079 #12 0x00036ce8 in run_actions (h=0xbec7f110, a=0x40288ed8, msg=0x40342814) at action.c:1583 #13 0x000263ac in do_action (h=0xbec7f110, a=0x40279db8, msg=0x40342814) at action.c:712 #14 0x00036ce8 in run_actions (h=0xbec7f110, a=0x4027321c, msg=0x40342814) at action.c:1583 #15 0x00037538 in run_top_route (a=0x4027321c, msg=0x40342814, c=0x0) at action.c:1669 #16 0x0013ce80 in receive_msg ( buf=0x41969714 "OPTIONS sip:10.100.92.42:5060 SIP/2.0\r\nVia: SIP/2.0/TCP 10.100.92.42:5062;rport;branch=z9hG4bK416855936\r\nFrom: sip:10.100.92.42:42177;tag=1996784611\r\nTo: sip:10.100.92.42:5060\r\nCall-ID: 2059319404"..., len=404, rcv_info=0x4196955c) at receive.c:216 #17 0x00248fbc in receive_tcp_msg ( tcpbuf=0x41969714 "OPTIONS sip:10.100.92.42:5060 SIP/2.0\r\nVia: SIP/2.0/TCP 10.100.92.42:5062;rport;branch=z9hG4bK416855936\r\nFrom: sip:10.100.92.42:42177;tag=1996784611\r\nTo: sip:10.100.92.42:5060\r\nCall-ID: 2059319404"..., len=404, rcv_info=0x4196955c, con=0x41969534) at tcp_read.c:1230 #18 0x0024c484 in tcp_read_req (con=0x41969534, bytes_read=0xbec7f714, read_flags=0xbec7f70c) at tcp_read.c:1431 #19 0x0024fdd8 in handle_io (fm=0x403321ac, events=1, idx=-1) at tcp_read.c:1655 #20 0x0023f814 in io_wait_loop_epoll (h=0x47d90c, t=2, repeat=0) at io_wait.h:1073 #21 0x00250fc8 in tcp_receive_loop (unix_sock=22) at tcp_read.c:1764 #22 0x00234674 in tcp_init_children () at tcp_main.c:4879 #23 0x000c9dec in main_loop () at main.c:1714 #24 0x000d0830 in main (argc=5, argv=0xbec7fd04) at main.c:2581 ```
In process 9375, I found it got stuck in recvfrom () (udp_server.c:446), and the code is:
`len=recvfrom(bind_address->socket, buf, BUF_SIZE, 0, &from->s,&fromlen);`
It seems operate using blocking mode instead of a non-blocking one, why??? Or maybe I miss a packet-check function like select() or poll()??
Thanks in advance. :)
--- Reply to this email directly or view it on GitHub: https://github.com/kamailio/kamailio/issues/510
About recvfrom part I think I got the answer from https://www.cs.cmu.edu/~srini/15-441/F01.full/www/assignments/P2/htmlsim_spl...
"...In an actual UNIX system, socket descriptors are set to be non-blocking using fcntl() with type O_NONBLOCK, and Recvfrom() returns errno EWOULDBLOCK when there is no data to be read on the non-blocking socket"
--- Reply to this email directly or view it on GitHub: https://github.com/kamailio/kamailio/issues/510#issuecomment-184673602
Closed #510.
--- Reply to this email directly or view it on GitHub: https://github.com/kamailio/kamailio/issues/510#event-552981647
About recvfrom part I think I got the answer from https://www.cs.cmu.edu/~srini/15-441/F01.full/www/assignments/P2/htmlsim_spl...
"...In an actual UNIX system, socket descriptors are set to be non-blocking using fcntl() with type O_NONBLOCK, and Recvfrom() returns errno EWOULDBLOCK when there is no data to be read on the non-blocking socket"
--- Reply to this email directly or view it on GitHub: https://github.com/kamailio/kamailio/issues/510#issuecomment-184990011
Reopened #510.
--- Reply to this email directly or view it on GitHub: https://github.com/kamailio/kamailio/issues/510#event-552981829
It is intended to have the udp workers waiting in recvfrom(). It is the kernel that passes the UDP messages when there is something received from the network and the application will get it via recvfrom(). If Kamailio stays in recvfrom() it means the kernel is not passing anything to it.
What is the operating system you are using?
Are registrations over udp?
Do you modules for traffic limiting or shaping like pike?
--- Reply to this email directly or view it on GitHub: https://github.com/kamailio/kamailio/issues/510#issuecomment-185093280
ARM Linux, registrations are over udp or tcp, and WITH_ANTIFLOOD is defined. No other modules related to traffic limit are used.
--- Reply to this email directly or view it on GitHub: https://github.com/kamailio/kamailio/issues/510#issuecomment-188628921
Back to this one -- so recvfrom() is intended to be blocking, each udp socket has its own pool of workers and if one is in recvfrom() that means it waits for udp traffic to come.
Was the traffic on udp not handled? I see the recv-q is high for tcp.
Have you noticed high cpu usage?
Also, if you have a lot of tcp traffic, try to increase the value for tcp_children.
--- You are receiving this because you are subscribed to this thread. Reply to this email directly or view it on GitHub: https://github.com/kamailio/kamailio/issues/510#issuecomment-204308110
Closing due to now follow up for long time. If still an issue, then reopen and add more details.
Closed #510.