aboutsummaryrefslogtreecommitdiff
path: root/afl-cmin.bash
diff options
context:
space:
mode:
authorhexcoder- <heiko@hexco.de>2020-01-22 19:07:02 +0100
committerhexcoder- <heiko@hexco.de>2020-01-22 19:07:02 +0100
commitce0b9dae5971f22cd0ae0b468322f78ee2a8a766 (patch)
tree135217f2c9b27dfff94e4de7f5d2c5494d566fb6 /afl-cmin.bash
parent7ce627c92e9b0536e254422d5ef604c3f58e43ce (diff)
downloadAFLplusplus-ce0b9dae5971f22cd0ae0b468322f78ee2a8a766.tar.gz
final step: rename afl-cmin to afl-cmin.bash and add a wrapper afl-cmin for afl-cmin.awk
Diffstat (limited to 'afl-cmin.bash')
-rwxr-xr-xafl-cmin.bash470
1 files changed, 470 insertions, 0 deletions
diff --git a/afl-cmin.bash b/afl-cmin.bash
new file mode 100755
index 00000000..1dd782d8
--- /dev/null
+++ b/afl-cmin.bash
@@ -0,0 +1,470 @@
+#!/usr/bin/env bash
+#
+# american fuzzy lop++ - corpus minimization tool
+# ---------------------------------------------
+#
+# Originally written by Michal Zalewski
+#
+# Copyright 2014, 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# This tool tries to find the smallest subset of files in the input directory
+# that still trigger the full range of instrumentation data points seen in
+# the starting corpus. This has two uses:
+#
+# - Screening large corpora of input files before using them as a seed for
+# afl-fuzz. The tool will remove functionally redundant files and likely
+# leave you with a much smaller set.
+#
+# (In this case, you probably also want to consider running afl-tmin on
+# the individual files later on to reduce their size.)
+#
+# - Minimizing the corpus generated organically by afl-fuzz, perhaps when
+# planning to feed it to more resource-intensive tools. The tool achieves
+# this by removing all entries that used to trigger unique behaviors in the
+# past, but have been made obsolete by later finds.
+#
+# Note that the tool doesn't modify the files themselves. For that, you want
+# afl-tmin.
+#
+# This script must use bash because other shells may have hardcoded limits on
+# array sizes.
+#
+
+echo "corpus minimization tool for afl-fuzz by Michal Zalewski"
+echo
+
+#########
+# SETUP #
+#########
+
+# Process command-line options...
+
+MEM_LIMIT=200
+TIMEOUT=none
+
+unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
+ AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE
+
+while getopts "+i:o:f:m:t:eQUCh" opt; do
+
+ case "$opt" in
+
+ "h")
+ ;;
+
+ "i")
+ IN_DIR="$OPTARG"
+ ;;
+
+ "o")
+ OUT_DIR="$OPTARG"
+ ;;
+ "f")
+ STDIN_FILE="$OPTARG"
+ ;;
+ "m")
+ MEM_LIMIT="$OPTARG"
+ MEM_LIMIT_GIVEN=1
+ ;;
+ "t")
+ TIMEOUT="$OPTARG"
+ ;;
+ "e")
+ EXTRA_PAR="$EXTRA_PAR -e"
+ ;;
+ "C")
+ export AFL_CMIN_CRASHES_ONLY=1
+ ;;
+ "Q")
+ EXTRA_PAR="$EXTRA_PAR -Q"
+ test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
+ QEMU_MODE=1
+ ;;
+ "U")
+ EXTRA_PAR="$EXTRA_PAR -U"
+ test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
+ UNICORN_MODE=1
+ ;;
+ "?")
+ exit 1
+ ;;
+
+ esac
+
+done
+
+shift $((OPTIND-1))
+
+TARGET_BIN="$1"
+
+if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then
+
+ cat 1>&2 <<_EOF_
+Usage: $0 [ options ] -- /path/to/target_app [ ... ]
+
+Required parameters:
+
+ -i dir - input directory with the starting corpus
+ -o dir - output directory for minimized files
+
+Execution control settings:
+
+ -f file - location read by the fuzzed program (stdin)
+ -m megs - memory limit for child process ($MEM_LIMIT MB)
+ -t msec - run time limit for child process (none)
+ -Q - use binary-only instrumentation (QEMU mode)
+ -U - use unicorn-based instrumentation (Unicorn mode)
+
+Minimization settings:
+
+ -C - keep crashing inputs, reject everything else
+ -e - solve for edge coverage only, ignore hit counts
+
+For additional tips, please consult docs/README.
+
+_EOF_
+ exit 1
+fi
+
+# Do a sanity check to discourage the use of /tmp, since we can't really
+# handle this safely from a shell script.
+
+if [ "$AFL_ALLOW_TMP" = "" ]; then
+
+ echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
+ T1="$?"
+
+ echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
+ T2="$?"
+
+ echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
+ T3="$?"
+
+ echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
+ T4="$?"
+
+ echo "$PWD" | grep -qE '^(/var)?/tmp/'
+ T5="$?"
+
+ if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
+ echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
+ exit 1
+ fi
+
+fi
+
+# If @@ is specified, but there's no -f, let's come up with a temporary input
+# file name.
+
+TRACE_DIR="$OUT_DIR/.traces"
+
+if [ "$STDIN_FILE" = "" ]; then
+
+ if echo "$*" | grep -qF '@@'; then
+ STDIN_FILE="$TRACE_DIR/.cur_input"
+ fi
+
+fi
+
+# Check for obvious errors.
+
+if [ ! "$MEM_LIMIT" = "none" ]; then
+
+ if [ "$MEM_LIMIT" -lt "5" ]; then
+ echo "[-] Error: dangerously low memory limit." 1>&2
+ exit 1
+ fi
+
+fi
+
+if [ ! "$TIMEOUT" = "none" ]; then
+
+ if [ "$TIMEOUT" -lt "10" ]; then
+ echo "[-] Error: dangerously low timeout." 1>&2
+ exit 1
+ fi
+
+fi
+
+if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then
+
+ TNEW="`which "$TARGET_BIN" 2>/dev/null`"
+
+ if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
+ echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
+ exit 1
+ fi
+
+ TARGET_BIN="$TNEW"
+
+fi
+
+if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$UNICORN_MODE" = "" ]; then
+
+ if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
+ echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
+ exit 1
+ fi
+
+fi
+
+if [ ! -d "$IN_DIR" ]; then
+ echo "[-] Error: directory '$IN_DIR' not found." 1>&2
+ exit 1
+fi
+
+test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"
+
+find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null
+rm -rf "$TRACE_DIR" 2>/dev/null
+
+rmdir "$OUT_DIR" 2>/dev/null
+
+if [ -d "$OUT_DIR" ]; then
+ echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
+ exit 1
+fi
+
+mkdir -m 700 -p "$TRACE_DIR" || exit 1
+
+if [ ! "$STDIN_FILE" = "" ]; then
+ rm -f "$STDIN_FILE" || exit 1
+ touch "$STDIN_FILE" || exit 1
+fi
+
+if [ "$AFL_PATH" = "" ]; then
+ SHOWMAP="${0%/afl-cmin}/afl-showmap"
+else
+ SHOWMAP="$AFL_PATH/afl-showmap"
+fi
+
+if [ ! -x "$SHOWMAP" ]; then
+ echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
+ rm -rf "$TRACE_DIR"
+ exit 1
+fi
+
+IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))
+
+if [ "$IN_COUNT" = "0" ]; then
+ echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
+ rm -rf "$TRACE_DIR"
+ exit 1
+fi
+
+FIRST_FILE=`ls "$IN_DIR" | head -1`
+
+# Make sure that we're not dealing with a directory.
+
+if [ -d "$IN_DIR/$FIRST_FILE" ]; then
+ echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
+ rm -rf "$TRACE_DIR"
+ exit 1
+fi
+
+# Check for the more efficient way to copy files...
+
+if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
+ CP_TOOL=ln
+else
+ CP_TOOL=cp
+fi
+
+# Make sure that we can actually get anything out of afl-showmap before we
+# waste too much time.
+
+echo "[*] Testing the target binary..."
+
+if [ "$STDIN_FILE" = "" ]; then
+
+ AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"
+
+else
+
+ cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
+ AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
+
+fi
+
+FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))
+
+if [ "$FIRST_COUNT" -gt "0" ]; then
+
+ echo "[+] OK, $FIRST_COUNT tuples recorded."
+
+else
+
+ echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
+ test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+ exit 1
+
+fi
+
+# Let's roll!
+
+#############################
+# STEP 1: COLLECTING TRACES #
+#############################
+
+echo "[*] Obtaining traces for input files in '$IN_DIR'..."
+
+(
+
+ CUR=0
+
+ if [ "$STDIN_FILE" = "" ]; then
+
+ ls "$IN_DIR" | while read -r fn; do
+
+ CUR=$((CUR+1))
+ printf "\\r Processing file $CUR/$IN_COUNT... "
+
+ "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"
+
+ done
+
+ else
+
+ ls "$IN_DIR" | while read -r fn; do
+
+ CUR=$((CUR+1))
+ printf "\\r Processing file $CUR/$IN_COUNT... "
+
+ cp "$IN_DIR/$fn" "$STDIN_FILE"
+
+ "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
+
+ done
+
+
+ fi
+
+)
+
+echo
+
+##########################
+# STEP 2: SORTING TUPLES #
+##########################
+
+# With this out of the way, we sort all tuples by popularity across all
+# datasets. The reasoning here is that we won't be able to avoid the files
+# that trigger unique tuples anyway, so we will want to start with them and
+# see what's left.
+
+echo "[*] Sorting trace sets (this may take a while)..."
+
+ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
+ sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"
+
+TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))
+
+echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
+
+#####################################
+# STEP 3: SELECTING CANDIDATE FILES #
+#####################################
+
+# The next step is to find the best candidate for each tuple. The "best"
+# part is understood simply as the smallest input that includes a particular
+# tuple in its trace. Empirical evidence suggests that this produces smaller
+# datasets than more involved algorithms that could be still pulled off in
+# a shell script.
+
+echo "[*] Finding best candidates for each tuple..."
+
+CUR=0
+
+ls -rS "$IN_DIR" | while read -r fn; do
+
+ CUR=$((CUR+1))
+ printf "\\r Processing file $CUR/$IN_COUNT... "
+
+ sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"
+
+done
+
+echo
+
+##############################
+# STEP 4: LOADING CANDIDATES #
+##############################
+
+# At this point, we have a file of tuple-file pairs, sorted by file size
+# in ascending order (as a consequence of ls -rS). By doing sort keyed
+# only by tuple (-k 1,1) and configured to output only the first line for
+# every key (-s -u), we end up with the smallest file for each tuple.
+
+echo "[*] Sorting candidate list (be patient)..."
+
+sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
+ sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"
+
+if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
+ echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
+ test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+ exit 1
+fi
+
+# The sed command converted the sorted list to a shell script that populates
+# BEST_FILE[tuple]="fname". Let's load that!
+
+. "$TRACE_DIR/.candidate_script"
+
+##########################
+# STEP 5: WRITING OUTPUT #
+##########################
+
+# The final trick is to grab the top pick for each tuple, unless said tuple is
+# already set due to the inclusion of an earlier candidate; and then put all
+# tuples associated with the newly-added file to the "already have" list. The
+# loop works from least popular tuples and toward the most common ones.
+
+echo "[*] Processing candidates and writing output files..."
+
+CUR=0
+
+touch "$TRACE_DIR/.already_have"
+
+while read -r cnt tuple; do
+
+ CUR=$((CUR+1))
+ printf "\\r Processing tuple $CUR/$TUPLE_COUNT... "
+
+ # If we already have this tuple, skip it.
+
+ grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue
+
+ FN=${BEST_FILE[tuple]}
+
+ $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"
+
+ if [ "$((CUR % 5))" = "0" ]; then
+ sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
+ mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
+ else
+ cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
+ fi
+
+done <"$TRACE_DIR/.all_uniq"
+
+echo
+
+OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`
+
+if [ "$OUT_COUNT" = "1" ]; then
+ echo "[!] WARNING: All test cases had the same traces, check syntax!"
+fi
+
+echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
+echo
+
+test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+
+exit 0