doc/examples/runemomniaggdemo.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251

# this is a quick and dirty migration of runemomniagg2.sh to the 
# --enable-demo mode of aggregate testing
function kill_netperfs {
    pkill -ALRM netperf

    pgrep -P 1 -f netperf > /dev/null
    while [ $? -eq 0 ]
    do
	sleep 1
	pgrep -P 1 -f netperf > /dev/null
    done
}

function run_cmd {

    NOW=`date +%s.%N`
    echo "Starting netperfs at $NOW for $TEST" | tee $TESTLOG
    i=0;

# the starting point for our load level pauses
    PAUSE_AT=1


    while [ $i -lt $MAX_INSTANCES ]
    do
	TARGET=${REMOTE_HOSTS[`expr $i % $NUM_REMOTE_HOSTS`]}
	echo "Starting netperfs on localhost targeting ${TARGET} for $TEST" | tee -a $TESTLOG
	id=`printf "%.5d" $i`
	$NETPERF -H $TARGET $NETPERF_CMD 2>&1 > netperf_${TEST}_${id}_to_${TARGET}.out &

    # give it a moment to get going
	sleep 1

	i=`expr $i + 1`

	if [ $i  -eq $PAUSE_AT ] && [ $i -ne $MAX_INSTANCES ]
	then
	    NOW=`date +%s.%N`
	    echo "Pausing for $DURATION seconds at $NOW with $i netperfs running for $TEST" | tee -a $TESTLOG
	    sleep $DURATION
	    PAUSE_AT=`expr $PAUSE_AT \* 2`
	    NOW=`date +%s.%N`
	    echo "Resuming at $NOW for $TEST" | tee -a $TESTLOG
	fi    
    done

    NOW=`date +%s.%N`
    echo "Netperfs started by $NOW for $TEST" | tee -a $TESTLOG

#wait for our test duration
    sleep $DURATION

#kludgey but this sleep should mean that another interim result will be emitted
    sleep 3

# stop all the netperfs
    NOW=`date +%s.%N`
    echo "Netperfs stopping $NOW for $TEST" | tee -a $TESTLOG
    kill_netperfs
    
    NOW=`date +%s.%N`
    echo "Netperfs stopped $NOW for $TEST" | tee -a $TESTLOG

}

# very much like run_cmd, but it runs the tests one at a time rather
# than in parallel.  We keep the same logging strings to be compatible
# (hopefully) with the post processing script, even though they don't
# make all that much sense :)

function run_cmd_serial {

    NOW=`date +%s.%N`
    echo "Starting netperfs at $NOW for $TEST" | tee $TESTLOG
    i=0;

# the starting point for our load level pauses
    PAUSE_AT=1


    while [ $i -lt $NUM_REMOTE_HOSTS ]
    do
	TARGET=${REMOTE_HOSTS[`expr $i % $NUM_REMOTE_HOSTS`]}
	echo "Starting netperfs on localhost targeting ${TARGET} for $TEST" | tee -a $TESTLOG
	id=`printf "%.5d" $i`
	$NETPERF -H $TARGET $NETPERF_CMD 2>&1 > netperf_${TEST}_${id}_to_${TARGET}.out &

    # give it a moment to get going
	sleep 1

	i=`expr $i + 1`

	NOW=`date +%s.%N`
	echo "Pausing for $DURATION seconds at $NOW with $i netperfs running for $TEST" | tee -a $TESTLOG
	# the plus two is to make sure we have a full set of interim
	# results.  probably not necessary here but we want to be
	# certain
	sleep `expr $DURATION + 1`
	kill_netperfs
	NOW=`date +%s.%N`
	THEN=`echo $NOW | awk -F "." '{printf("%d.%d",$1-1,$2)}'`
	echo "Resuming at $THEN for $TEST" | tee -a $TESTLOG

    done

    NOW=`date +%s.%N`
    echo "Netperfs started by $NOW for $TEST" | tee -a $TESTLOG

# stop all the netperfs - of course actually they have all been
# stopped already, we just want the log entries
    NOW=`date +%s.%N`
    echo "Netperfs stopping $NOW for $TEST" | tee -a $TESTLOG
    kill_netperfs
    NOW=`date +%s.%N`
    echo "Netperfs stopped $NOW for $TEST" | tee -a $TESTLOG
}

# here then is the "main" part

if [ ! -f ./remote_hosts ]
then
    echo "This script requires a remote_hosts file"
    exit -1
fi
. ./remote_hosts

# how many processors are there on this system
NUM_CPUS=`grep processor /proc/cpuinfo | wc -l`

# the number of netperf instances we will run will be up to 2x the
# number of CPUs
MAX_INSTANCES=`expr $NUM_CPUS \* 2`

# but at least as many as there are entries in remote_hosts
if [ $MAX_INSTANCES -lt $NUM_REMOTE_HOSTS ]
then
    MAX_INSTANCES=$NUM_REMOTE_HOSTS
fi

# allow the netperf binary to be used to be overridden 
NETPERF=${NETPERF:="netperf"}

if [ $NUM_REMOTE_HOSTS -lt 2 ]
then
    echo "The list of remote hosts is too short.  There must be at least 2."
    exit -1
fi

# we assume that netservers are already running on all the load generators

DURATION=120
# do not have a uuidgen? then use the one in netperf
MY_UUID=`uuidgen`
# with top-of-trunk we could make this 0 and run forever
# but two hours is something of a failsafe if the signals
# get lost
LENGTH="-l 7200"
OUTPUT="-o all"

DO_STREAM=1;
DO_MAERTS=1;
# NOTE!  The Bidir test depends on being able to set a socket buffer
# size greater than 13 * 64KB or 832 KB or there is a risk of the test
# hanging.  If you are running linux, make certain that
# net.core.[r|w]mem_max are sufficiently large
DO_BIDIR=1;
DO_RRAGG=1;
DO_RR=1;
DO_ANCILLARY=1;

# UDP_RR for TPC/PPS using single-byte transactions. we do not use
# TCP_RR any longer because any packet losses or other matters
# affecting the congestion window will break our desire that there be
# a one to one correspondence between requests/responses and packets.
if [ $DO_RRAGG -eq 1 ]; then
    BURST=`find_max_burst.sh ${REMOTE_HOSTS[0]}`
    if [ $BURST -eq -1 ]; then
        # use a value that find_max_burst will not have picked
        BURST=9
        echo "find_max_burst.sh returned -1 so picking a burst of $BURST"
    fi
    TEST="tps"
    TESTLOG="netperf_tps.log"
    NETPERF_CMD="-D 0.5 -c -C -f x -P 0 -t omni $LENGTH -v 2 -- -r 1 -b $BURST -e 1 -T udp -u $MY_UUID $OUTPUT"
    run_cmd
fi

# Bidirectional using burst-mode TCP_RR and large request/response size
if [ $DO_BIDIR -eq 1 ]; then
    TEST="bidirectional"
    TESTLOG="netperf_bidirectional.log"
    NETPERF_CMD="-D 0.5 -c -C -f m -P 0 -t omni $LENGTH -v 2 -- -r 64K -s 1M -S 1M -b 12 -u $MY_UUID $OUTPUT"
    run_cmd
fi

# TCP_STREAM aka outbound with a 64K send size
# the netperf command is everything but netperf -H mumble
if [ $DO_STREAM -eq 1 ];then
    TEST="outbound"
    TESTLOG="netperf_outbound.log"
    NETPERF_CMD="-D 0.5 -c -C -f m -P 0 -t omni $LENGTH -v 2 -- -m 64K -u $MY_UUID $OUTPUT"
    run_cmd
fi

# TCP_MAERTS aka inbound with a 64K send size - why is this one last?
# because presently when I pkill the netperf of a "MAERTS" test, the
# netserver does not behave well and it may not be possible to get it
# to behave well.  but we will still have all the interim results even
# if we don't get the final results, the useful parts of which will be
# the same as the other tests anyway
if [ $DO_MAERTS -eq 1 ]; then
    TEST="inbound"
    TESTLOG="netperf_inbound.log"
    NETPERF_CMD="-D 0.5 -c -C -f m -P 0 -t omni $LENGTH -v 2 -- -m ,64K -u $MY_UUID $OUTPUT"
    run_cmd
fi

# A single-stream of synchronous, no-burst TCP_RR in an "aggregate"
# script?  Yes, because the way the aggregate tests work, while there
# is a way to see what the performance of a single bulk transfer was,
# there is no way to see a basic latency - by the time
# find_max_burst.sh has completed, we are past a burst size of 0
if [ $DO_RR -eq 1 ]; then
    if [ $DURATION -lt 60 ]; then
	DURATION=60
    fi
    TEST="sync_tps"
    TESTLOG="netperf_sync_tps.log"
    NETPERF_CMD="-D 0.5 -c -C -f x -P 0 -t omni $LENGTH -v 2 -- -r 1 -u $MY_UUID $OUTPUT"
    run_cmd_serial
fi


# now some ancillary things which may nor may not work on your platform
if [ $DO_ANCILLARY -eq 1 ];then
    dmidecode 2>&1 > dmidecode.txt
    uname -a 2>&1 > uname.txt
    cat /proc/cpuinfo 2>&1 > cpuinfo.txt
    cat /proc/meminfo 2>&1 > meminfo.txt
    ifconfig -a 2>&1 > ifconfig.txt
    netstat -rn 2>&1 > netstat.txt
    dpkg -l 2>&1 > dpkg.txt
    rpm -qa 2>&1 > rpm.txt
    cat /proc/interrupts 2>&1 > interrupts.txt
    i=0
    while [ $i -lt `expr $NUM_REMOTE_HOSTS - 1` ]
    do
	traceroute ${REMOTE_HOSTS[$i]} > traceroute_${REMOTE_HOSTS[$i]}.txt
	i=`expr $i + 1`
    done
fi