### Description
Segfault , suspected cause, writing out of bound of an array
### Troubleshooting
In progress
#### Reproduction
Hard
#### Debugging Data
``` /* code reference */ typedef struct _ds_set { int id; /*!< id of dst set */ int nr; /*!< number of items in dst set */ int last; /*!< last used item in dst set (round robin) */ int wlast; /*!< last used item in dst set (by weight) */ int rwlast; /*!< last used item in dst set (by relative weight) */ ds_dest_t *dlist; unsigned int wlist[100]; unsigned int rwlist[100]; struct _ds_set *next[2]; int longer; gen_lock_t lock; } ds_set_t;
Here we can see that next is having invalid value (in fact it should have been 0/NULL in this case) : 20000000220000000220
2964>->-->--ds_ping_set(node->next[i]); (gdb) bt #0 0x00007f3b1cfde6c7 in ds_ping_set (node=0x200000002) at dispatch.c:2964 #1 0x00007f3b1cfde6d3 in ds_ping_set (node=0x7f3a99a09fc8) at dispatch.c:2964 #2 0x00007f3b1cfde6d3 in ds_ping_set (node=0x7f3a99a09828) at dispatch.c:2964 #3 0x00007f3b1cfdf9ad in ds_check_timer (ticks=9987101, param=0x0) at dispatch.c:3022 #4 0x00005644376a3652 in sr_wtimer_exec (ticks=9987101, param=0x0) at core/timer_proc.c:390 #5 0x00005644376a276d in fork_sync_timer (child_id=-1, desc=0x5644378904c1 "secondary timer", make_sock=1, f=0x5644376a330c <sr_wtimer_exec>, param=0x0, interval=1000) at core/timer_proc.c:224 #6 0x00005644376a39ca in sr_wtimer_start () at core/timer_proc.c:416 #7 0x00005644374d2d59 in main_loop () at main.c:1702 #8 0x00005644374da171 in main (argc=12, argv=0x7ffe7c214ac8) at main.c:2650 (gdb) p (ds_set_t) *0x7f3a99a09828 $1 = {id = 2, nr = 2, last = 0, wlast = 0, rwlast = 0, dlist = 0x7f3a99a0aab8, wlist = {0 <repeats 100 times>}, rwlist = {0 <repeats 100 times>}, next = {0x7f3a99a09fc8, 0x0}, longer = 0, lock = {val = 0}} (gdb) p (ds_set_t) *0x0x7f3a99a09fc8 Invalid number "0x0x7f3a99a09fc8". (gdb) p (ds_set_t) *0x7f3a99a09fc8 $2 = {id = 1, nr = 3, last = 0, wlast = 0, rwlast = 0, dlist = 0x7f3a99a0a7f8, wlist = {1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,~ 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1}, rwlist = {1, 1, 2, 2, 1, 0, 1, 0, 2, 2, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 1, 2, 1, 0, 0, 0, 0, 2, 1, 0, 2, 1, 2, 1, 1, 0, 2, 1, 2, 2, 2, 0, 0,~ 2, 0, 2, 2, 0, 2, 2, 0, 1, 2, 1, 1, 2, 1, 1, 0, 1, 1, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 1, 0, 2, 1, 1, 2, 0, 2, 1, 2, 1, 1, 0, 2, 1, 2, 2, 1, 2, 1, 1, 0}, next = {0x200000002, 0x200000002}, longer = 2, lock = {val = 0}} (gdb) ```
### Possible Solutions
Further analysis of the relevant source code around dp_init_relative_weights() and the way it was reused with congestion control.
more precisely, looks like 5 x ints with value 2 where written out of bound 2,1,2,1,1,0 2 1 2 1 1 0 222220 overflow
``` (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*1) $53 = 0 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*2) $54 = 2 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*3) $55 = 2 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*4) $56 = 2 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*5) $57 = 2 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*6) $58 = 2 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*7) $59 = 0 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*8) $60 = 1 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*9) $61 = 1 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*10) $62 = 2 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*11) $63 = 1 (gdb) p (int)*(0x7f3a99a09fc8 + sizeof(struct _ds_set) - sizeof(unsigned int)*12) $64 = 2 ```
The lock was set to 0 else it would have been 48 times 2
``` /* if the array was not completely filled (i.e., the sum of rweights is * less than 100 due to truncated), then use last address to fill the rest */ unsigned int last_insert = t > 0 ? dset->rwlist[t - 1] : (unsigned int)(dset->nr - 1); for(j = t; j < 100; j++) dset->rwlist[j] = last_insert;
/* shuffle the content of the array in order to mix the selection * of the addresses (e.g., if first address has weight=20, avoid * sending first 20 calls to it, but ensure that within a 100 calls, * 20 go to first address */ shuffle_uint100array(dset->rwlist); lock_release(&dset->lock); return 0; } ```
Re run the code in isolation using the inputs present
``` ~/git/examples$ valgrind ./bin/shuffle ==5245== Memcheck, a memory error detector ==5245== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al. ==5245== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info ==5245== Command: ./bin/shuffle ==5245== rw_sum[0][143][33] rw_sum[1][143][33] rw_sum[2][143][32] redistributed array 0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|0|0| fill the truncate t[98] with[2] 0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2| shuffled array 2|2|2|1|1|0|1|1|2|1|1|0|0|0|2|1|2|1|1|2|1|0|2|2|1|0|0|0|1|0|2|2|1|0|0|2|2|2|0|1|0|2|0|1|2|0|2|2|1|0|2|0|0|1|1|0|2|1|2|1|0|2|1|2|1|2|1|2|0|0|2|1|0|0|2|1|2|0|0|2|1|0|0|2|1|1|1|1|2|0|1|1|1|2|0|2|0|0|2|0| ==5245== ==5245== HEAP SUMMARY: ==5245== in use at exit: 0 bytes in 0 blocks ==5245== total heap usage: 1 allocs, 1 frees, 1,024 bytes allocated ==5245== ==5245== All heap blocks were freed -- no leaks are possible ==5245== ==5245== For counts of detected and suppressed errors, rerun with: -v ==5245== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0) ```
shuffle_uint100array looks good, seems like the corruption took place before the lock was released since the int was set back to 0, hmm, still thinking ...
quite sure I found it !
``` /** * Initialize the relative weight distribution for a destination set * - fill the array of 0..99 elements where to keep the index of the * destination address to be used. The Nth call will use * the address with the index at possition N%100 */ int dp_init_relative_weights(ds_set_t *dset) { int j; int k; int t;
if(dset == NULL || dset->dlist == NULL) return -1;
lock_get(&dset->lock); int rw_sum = 0; /* find the sum of relative weights*/ for(j = 0; j < dset->nr; j++) { // READING THE FLAG ONCE if(ds_skip_dst(dset->dlist[j].flags)) continue; rw_sum += dset->dlist[j].attrs.rweight; }
if(rw_sum == 0) { lock_release(&dset->lock); return 0; }
/* fill the array based on the relative weight of each destination */ t = 0; for(j = 0; j < dset->nr; j++) { if(ds_skip_dst(dset->dlist[j].flags)) // READING THE FLAG AGAIN, SEGFAULT IF THEY CHANGED ! continue;
int current_slice = dset->dlist[j].attrs.rweight * 100 / rw_sum; //truncate here; LM_DBG("rw_sum[%d][%d][%d]\n",j, rw_sum, current_slice); for(k = 0; k < current_slice; k++) { dset->rwlist[t] = (unsigned int)j; t++; } }
/* if the array was not completely filled (i.e., the sum of rweights is * less than 100 due to truncated), then use last address to fill the rest */ unsigned int last_insert = t > 0 ? dset->rwlist[t - 1] : (unsigned int)(dset->nr - 1); for(j = t; j < 100; j++) dset->rwlist[j] = last_insert;
/* shuffle the content of the array in order to mix the selection * of the addresses (e.g., if first address has weight=20, avoid * sending first 20 calls to it, but ensure that within a 100 calls, * 20 go to first address */ shuffle_uint100array(dset->rwlist); lock_release(&dset->lock); return 0; } ```
``` rw_sum[0][96][50] rw_sum[1][96][50] rw_sum[2][96][48] redistributed array 0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1| fill the truncate t[148] with[2] 0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1| shuffled array 1|0|1|1|0|0|0|1|1|1|1|1|1|1|0|1|1|0|1|0|0|1|0|0|0|0|0|0|1|1|1|1|1|1|1|0|0|1|1|0|0|1|1|0|0|1|0|0|0|1|1|0|1|0|0|0|1|1|0|0|1|0|0|1|0|1|0|1|0|0|1|0|0|1|0|1|0|0|1|0|0|1|0|1|0|1|1|1|1|1|0|0|1|0|1|1|1|0|0|0| *** stack smashing detected ***: ./bin/shuffle terminated Aborted ```
Syncronization problem where we read a value twice that could change in between. I even found 48 offset in this case !
I guess you are going to make a pull request from your cloned repository with the patches referred above.
Closed #1649.