final step: rename afl-cmin to afl-cmin.bash and add a wrapper afl-cmin for afl-cmin.awk

author: hexcoder- <heiko@hexco.de> 2020-01-22 19:07:02 +0100
committer: hexcoder- <heiko@hexco.de> 2020-01-22 19:07:02 +0100
commit: ce0b9dae5971f22cd0ae0b468322f78ee2a8a766 (patch)
tree: 135217f2c9b27dfff94e4de7f5d2c5494d566fb6 /afl-cmin.bash
parent: 7ce627c92e9b0536e254422d5ef604c3f58e43ce (diff)
download: AFLplusplus-ce0b9dae5971f22cd0ae0b468322f78ee2a8a766.tar.gz
1 files changed, 470 insertions, 0 deletions
diff --git a/afl-cmin.bash b/afl-cmin.bash
new file mode 100755
index 00000000..1dd782d8
--- /dev/null
+++ b/afl-cmin.bash
@@ -0,0 +1,470 @@
+#!/usr/bin/env bash
+#
+# american fuzzy lop++ - corpus minimization tool
+# ---------------------------------------------
+#
+# Originally written by Michal Zalewski
+#
+# Copyright 2014, 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# This tool tries to find the smallest subset of files in the input directory
+# that still trigger the full range of instrumentation data points seen in
+# the starting corpus. This has two uses:
+#
+#   - Screening large corpora of input files before using them as a seed for
+#     afl-fuzz. The tool will remove functionally redundant files and likely
+#     leave you with a much smaller set.
+#
+#     (In this case, you probably also want to consider running afl-tmin on
+#     the individual files later on to reduce their size.)
+#
+#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
+#     planning to feed it to more resource-intensive tools. The tool achieves
+#     this by removing all entries that used to trigger unique behaviors in the
+#     past, but have been made obsolete by later finds.
+#
+# Note that the tool doesn't modify the files themselves. For that, you want
+# afl-tmin.
+#
+# This script must use bash because other shells may have hardcoded limits on
+# array sizes.
+#
+
+echo "corpus minimization tool for afl-fuzz by Michal Zalewski"
+echo
+
+#########
+# SETUP #
+#########
+
+# Process command-line options...
+
+MEM_LIMIT=200
+TIMEOUT=none
+
+unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
+  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE
+
+while getopts "+i:o:f:m:t:eQUCh" opt; do
+
+  case "$opt" in 
+
+    "h")
+	;;
+
+    "i")
+         IN_DIR="$OPTARG"
+         ;;
+
+    "o")
+         OUT_DIR="$OPTARG"
+         ;;
+    "f")
+         STDIN_FILE="$OPTARG"
+         ;;
+    "m")
+         MEM_LIMIT="$OPTARG"
+         MEM_LIMIT_GIVEN=1
+         ;;
+    "t")
+         TIMEOUT="$OPTARG"
+         ;;
+    "e")
+         EXTRA_PAR="$EXTRA_PAR -e"
+         ;;
+    "C")
+         export AFL_CMIN_CRASHES_ONLY=1
+         ;;
+    "Q")
+         EXTRA_PAR="$EXTRA_PAR -Q"
+         test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
+         QEMU_MODE=1
+         ;;
+    "U")
+         EXTRA_PAR="$EXTRA_PAR -U"
+         test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
+         UNICORN_MODE=1
+         ;;    
+    "?")
+         exit 1
+         ;;
+
+   esac
+
+done
+
+shift $((OPTIND-1))
+
+TARGET_BIN="$1"
+
+if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then
+
+  cat 1>&2 <<_EOF_
+Usage: $0 [ options ] -- /path/to/target_app [ ... ]
+
+Required parameters:
+
+  -i dir        - input directory with the starting corpus
+  -o dir        - output directory for minimized files
+
+Execution control settings:
+
+  -f file       - location read by the fuzzed program (stdin)
+  -m megs       - memory limit for child process ($MEM_LIMIT MB)
+  -t msec       - run time limit for child process (none)
+  -Q            - use binary-only instrumentation (QEMU mode)
+  -U            - use unicorn-based instrumentation (Unicorn mode)
+  
+Minimization settings:
+
+  -C            - keep crashing inputs, reject everything else
+  -e            - solve for edge coverage only, ignore hit counts
+
+For additional tips, please consult docs/README.
+
+_EOF_
+  exit 1
+fi
+
+# Do a sanity check to discourage the use of /tmp, since we can't really
+# handle this safely from a shell script.
+
+if [ "$AFL_ALLOW_TMP" = "" ]; then
+
+  echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
+  T1="$?"
+
+  echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
+  T2="$?"
+
+  echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
+  T3="$?"
+
+  echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
+  T4="$?"
+
+  echo "$PWD" | grep -qE '^(/var)?/tmp/'
+  T5="$?"
+
+  if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
+    echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
+    exit 1
+  fi
+
+fi
+
+# If @@ is specified, but there's no -f, let's come up with a temporary input
+# file name.
+
+TRACE_DIR="$OUT_DIR/.traces"
+
+if [ "$STDIN_FILE" = "" ]; then
+
+  if echo "$*" | grep -qF '@@'; then
+    STDIN_FILE="$TRACE_DIR/.cur_input"
+  fi
+
+fi
+
+# Check for obvious errors.
+
+if [ ! "$MEM_LIMIT" = "none" ]; then
+
+  if [ "$MEM_LIMIT" -lt "5" ]; then
+    echo "[-] Error: dangerously low memory limit." 1>&2
+    exit 1
+  fi
+
+fi
+
+if [ ! "$TIMEOUT" = "none" ]; then
+
+  if [ "$TIMEOUT" -lt "10" ]; then
+    echo "[-] Error: dangerously low timeout." 1>&2
+    exit 1
+  fi
+
+fi
+
+if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then
+
+  TNEW="`which "$TARGET_BIN" 2>/dev/null`"
+
+  if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
+    echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
+    exit 1
+  fi
+
+  TARGET_BIN="$TNEW"
+
+fi
+
+if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$UNICORN_MODE" = "" ]; then
+
+  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
+    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
+    exit 1
+  fi
+
+fi
+
+if [ ! -d "$IN_DIR" ]; then
+  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
+  exit 1
+fi
+
+test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"
+
+find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null
+rm -rf "$TRACE_DIR" 2>/dev/null
+
+rmdir "$OUT_DIR" 2>/dev/null
+
+if [ -d "$OUT_DIR" ]; then
+  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
+  exit 1
+fi
+
+mkdir -m 700 -p "$TRACE_DIR" || exit 1
+
+if [ ! "$STDIN_FILE" = "" ]; then
+  rm -f "$STDIN_FILE" || exit 1
+  touch "$STDIN_FILE" || exit 1
+fi
+
+if [ "$AFL_PATH" = "" ]; then
+  SHOWMAP="${0%/afl-cmin}/afl-showmap"
+else
+  SHOWMAP="$AFL_PATH/afl-showmap"
+fi
+
+if [ ! -x "$SHOWMAP" ]; then
+  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
+  rm -rf "$TRACE_DIR"
+  exit 1
+fi
+
+IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))
+
+if [ "$IN_COUNT" = "0" ]; then
+  echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
+  rm -rf "$TRACE_DIR"
+  exit 1
+fi
+
+FIRST_FILE=`ls "$IN_DIR" | head -1`
+
+# Make sure that we're not dealing with a directory.
+
+if [ -d "$IN_DIR/$FIRST_FILE" ]; then
+  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
+  rm -rf "$TRACE_DIR"
+  exit 1
+fi
+
+# Check for the more efficient way to copy files...
+
+if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
+  CP_TOOL=ln
+else
+  CP_TOOL=cp
+fi
+
+# Make sure that we can actually get anything out of afl-showmap before we
+# waste too much time.
+
+echo "[*] Testing the target binary..."
+
+if [ "$STDIN_FILE" = "" ]; then
+
+  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"
+
+else
+
+  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
+  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
+
+fi
+
+FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))
+
+if [ "$FIRST_COUNT" -gt "0" ]; then
+
+  echo "[+] OK, $FIRST_COUNT tuples recorded."
+
+else
+
+  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
+  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+  exit 1
+
+fi
+
+# Let's roll!
+
+#############################
+# STEP 1: COLLECTING TRACES #
+#############################
+
+echo "[*] Obtaining traces for input files in '$IN_DIR'..."
+
+(
+
+  CUR=0
+
+  if [ "$STDIN_FILE" = "" ]; then
+
+    ls "$IN_DIR" | while read -r fn; do
+
+      CUR=$((CUR+1))
+      printf "\\r    Processing file $CUR/$IN_COUNT... "
+
+      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"
+
+    done
+
+  else
+
+    ls "$IN_DIR" | while read -r fn; do
+
+      CUR=$((CUR+1))
+      printf "\\r    Processing file $CUR/$IN_COUNT... "
+
+      cp "$IN_DIR/$fn" "$STDIN_FILE"
+
+      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
+
+    done
+
+
+  fi
+
+)
+
+echo
+
+##########################
+# STEP 2: SORTING TUPLES #
+##########################
+
+# With this out of the way, we sort all tuples by popularity across all
+# datasets. The reasoning here is that we won't be able to avoid the files
+# that trigger unique tuples anyway, so we will want to start with them and
+# see what's left.
+
+echo "[*] Sorting trace sets (this may take a while)..."
+
+ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
+  sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"
+
+TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))
+
+echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
+
+#####################################
+# STEP 3: SELECTING CANDIDATE FILES #
+#####################################
+
+# The next step is to find the best candidate for each tuple. The "best"
+# part is understood simply as the smallest input that includes a particular
+# tuple in its trace. Empirical evidence suggests that this produces smaller
+# datasets than more involved algorithms that could be still pulled off in
+# a shell script.
+
+echo "[*] Finding best candidates for each tuple..."
+
+CUR=0
+
+ls -rS "$IN_DIR" | while read -r fn; do
+
+  CUR=$((CUR+1))
+  printf "\\r    Processing file $CUR/$IN_COUNT... "
+
+  sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"
+
+done
+
+echo
+
+##############################
+# STEP 4: LOADING CANDIDATES #
+##############################
+
+# At this point, we have a file of tuple-file pairs, sorted by file size
+# in ascending order (as a consequence of ls -rS). By doing sort keyed
+# only by tuple (-k 1,1) and configured to output only the first line for
+# every key (-s -u), we end up with the smallest file for each tuple.
+
+echo "[*] Sorting candidate list (be patient)..."
+
+sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
+  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"
+
+if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
+  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
+  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+  exit 1
+fi
+
+# The sed command converted the sorted list to a shell script that populates
+# BEST_FILE[tuple]="fname". Let's load that!
+
+. "$TRACE_DIR/.candidate_script"
+
+##########################
+# STEP 5: WRITING OUTPUT #
+##########################
+
+# The final trick is to grab the top pick for each tuple, unless said tuple is
+# already set due to the inclusion of an earlier candidate; and then put all
+# tuples associated with the newly-added file to the "already have" list. The
+# loop works from least popular tuples and toward the most common ones.
+
+echo "[*] Processing candidates and writing output files..."
+
+CUR=0
+
+touch "$TRACE_DIR/.already_have"
+
+while read -r cnt tuple; do
+
+  CUR=$((CUR+1))
+  printf "\\r    Processing tuple $CUR/$TUPLE_COUNT... "
+
+  # If we already have this tuple, skip it.
+
+  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue
+
+  FN=${BEST_FILE[tuple]}
+
+  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"
+
+  if [ "$((CUR % 5))" = "0" ]; then
+    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
+    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
+  else
+    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
+  fi
+
+done <"$TRACE_DIR/.all_uniq"
+
+echo
+
+OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`
+
+if [ "$OUT_COUNT" = "1" ]; then
+  echo "[!] WARNING: All test cases had the same traces, check syntax!"
+fi
+
+echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
+echo
+
+test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
+
+exit 0
author	hexcoder- <heiko@hexco.de>	2020-01-22 19:07:02 +0100
committer	hexcoder- <heiko@hexco.de>	2020-01-22 19:07:02 +0100
commit	ce0b9dae5971f22cd0ae0b468322f78ee2a8a766 (patch)
tree	135217f2c9b27dfff94e4de7f5d2c5494d566fb6 /afl-cmin.bash
parent	7ce627c92e9b0536e254422d5ef604c3f58e43ce (diff)
download	AFLplusplus-ce0b9dae5971f22cd0ae0b468322f78ee2a8a766.tar.gz