---------- Forwarded message ----------
From: Zunnun <zunnun@gmail.com>
Date: Wed, Mar 30, 2011 at 10:51 AM
Subject: crash & 100% CPU usage problem with kamailio 3.1.2
To: sr-dev@lists.sip-router.org


kamailio 3.1.2 issues

Problem 1:

Running a heavy stress test (for a few hours), we have seen 100% CPU usage.
Reason: the linked list has become circular: a next pointer points to its own node, so the loop never terminates.

file: tcp_main.c

function: 

inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,struct ip_addr* l_ip, int l_port,int flags)

for (a=tcpconn_aliases_hash[hash], nxt=0; a; a=nxt){
                        nxt=a->next;

Here a->next points to a itself, so nxt == a on every iteration and the loop never terminates.




Problem 2: 
kamailio process terminates (heavy stress for over 24 hours)

Reason: it calls abort()

file: tcp_main.c
function

/*
 * Drops one reference on tcpconn and, when atomic_dec_and_test() reports
 * true (presumably: the reference count reached zero -- confirm against the
 * atomic ops implementation), frees the connection.
 *
 * Before freeing, the connection flags are validated: F_CONN_FD_CLOSED must
 * be set and none of F_CONN_HASHED, F_CONN_MAIN_TIMER, F_CONN_READ_W or
 * F_CONN_WRITE_W may be set. If that check fails, a critical "BUG" message
 * is logged and abort() is called, so the process terminates.
 *
 * Returns 1 if the connection was released through _tcpconn_free(),
 * 0 if the decrement did not trigger the release path.
 */
inline static int tcpconn_chld_put(struct tcp_connection* tcpconn)
{
        if (unlikely(atomic_dec_and_test(&tcpconn->refcnt))){
                DBG("tcpconn_chld_put: destroying connection %p (%d, %d) "
                                "flags %04x\n", tcpconn, tcpconn->id,
                                tcpconn->s, tcpconn->flags);
                /* sanity checks */
                membar_read_atomic_op(); /* make sure we see the current flags */
                /* bad state: fd not marked closed, or the connection is still
                 * flagged as hashed / on the main timer / watched for
                 * read or write */
                if (unlikely(!(tcpconn->flags & F_CONN_FD_CLOSED) ||
                        (tcpconn->flags &
                                (F_CONN_HASHED|F_CONN_MAIN_TIMER|
                                 F_CONN_READ_W|F_CONN_WRITE_W)) )){
                        LOG(L_CRIT, "BUG: tcpconn_chld_put: %p bad flags = %0x\n",
                                        tcpconn, tcpconn->flags);
                        abort(); /* reporter's note: this is the abort() hit in Problem 2 */
                }
                _tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
                return 1;
        }
        return 0;
}


Problem 3: 
kamailio crashed (heavy stress, seen it twice after 4 days 8 hours)
Reason: the circular linked list is corrupted: the prev pointer is NULL and kamailio dereferences it.

#0  local_timer_list_expire (lt=0x82eea0, saved_ticks=1295476481) at local_timer.c:221
221                     _timer_rm_list(tl); /* detach */
(gdb) bt
#0  local_timer_list_expire (lt=0x82eea0, saved_ticks=1295476481) at local_timer.c:221
#1  local_timer_expire (lt=0x82eea0, saved_ticks=1295476481) at local_timer.c:250
#2  local_timer_run (lt=0x82eea0, saved_ticks=1295476481) at local_timer.c:274
#3  0x0000000000510c3e in tcp_timer_run () at tcp_main.c:4384
#4  tcp_main_loop () at tcp_main.c:4564
#5  0x0000000000469eba in main_loop () at main.c:1641
#6  0x000000000046c04f in main (argc=<value optimized out>, argv=0x7fff3d28a3c8) at main.c:2398

(gdb) print tl
$1 = <value optimized out>
(gdb) print h
$2 = (struct timer_head *) 0x855eb8
(gdb) print *h
$3 = {next = 0x0, prev = 0x2acbac5398b8}
(gdb) print *h->prev
$4 = {next = 0x0, prev = 0x855eb8, expire = 1295476481, initial_timeout = 1920, data = 0x2acbac5397d0, f = 0x4f8310 <tcpconn_main_timeout>, flags = 512, slow_idx = 0}
(gdb)

In one crash the prev pointer was NULL; in the next crash the next pointer was NULL.

Problem 4: 
kamailio process terminated (heavy stress, found it twice, after 16 hours)
Reason: it calls abort()

file : mem/q_malloc.c

function

void qm_free(struct qm_block* qm, void* p)

partial code:

#ifdef DBG_QM_MALLOC
        qm_debug_frag(qm, f);
        if (f->u.is_free){
                LOG(L_CRIT, "BUG: qm_free: freeing already freed pointer,"
                                " first free: %s: %s(%ld) - aborting\n",
                                f->file, f->func, f->line);
                abort(); //CALLS ABORT
                
        }
        MDBG("qm_free: freeing frag. %p alloc'ed from %s: %s(%ld)\n",
                        f, f->file, f->func, f->line);
#endif

problem 5:
Infinite loop: the log file is full of these messages, and CPU usage is at 100% at that time.
/kamailio[21562]: : <core> [io_wait.h:617]: BUG: io_watch_del: invalid fd -1, not in [0, 2)
//kamailio[21562]: : <core> [tcp_read.c:1218]: ERROR: tcpconn_receive: handle_io: io_watch_del failed for 0x2acbac5397d0