#!/usr/bin/env bash
#
# american fuzzy lop++ - corpus minimization tool
# ---------------------------------------------
#
# Originally written by Michal Zalewski
#
# Copyright 2014, 2015 Google Inc. All rights reserved.
#
# Copyright 2019-2023 AFLplusplus
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#   https://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
#   - Screening large corpora of input files before using them as a seed for
#     afl-fuzz. The tool will remove functionally redundant files and likely
#     leave you with a much smaller set.
#
#     (In this case, you probably also want to consider running afl-tmin on
#     the individual files later on to reduce their size.)
#
#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
#     planning to feed it to more resource-intensive tools. The tool achieves
#     this by removing all entries that used to trigger unique behaviors in the
#     past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#
# Known limitation: input file names are enumerated with ls, so names that
# contain newlines are not supported.
#

echo "corpus minimization tool for afl-fuzz"
echo

#########
# SETUP #
#########

# Process command-line options...

MEM_LIMIT=none
TIMEOUT=5000

# Start from a clean slate so that values inherited from the environment can
# not silently switch modes. FRIDA_MODE and NYX_MODE are included because both
# are consulted below to decide whether the binary checks may be skipped.
unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN F_ARG \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE T_ARG \
  FRIDA_MODE NYX_MODE

# Extra afl-showmap switches, kept as an array so that they are passed on as
# separate words without relying on unquoted word splitting.
EXTRA_PAR=()

export AFL_QUIET=1

while getopts "+i:o:f:m:t:T:eOQUAChXY" opt; do

  case "$opt" in

    "h")
      ;;

    "i")
      IN_DIR="$OPTARG"
      ;;

    "o")
      OUT_DIR="$OPTARG"
      ;;

    "f")
      STDIN_FILE="$OPTARG"
      F_ARG=1
      ;;

    "m")
      MEM_LIMIT="$OPTARG"
      MEM_LIMIT_GIVEN=1
      ;;

    "t")
      TIMEOUT="$OPTARG"
      ;;

    "e")
      EXTRA_PAR+=(-e)
      ;;

    "A")
      export AFL_CMIN_ALLOW_ANY=1
      ;;

    "C")
      export AFL_CMIN_CRASHES_ONLY=1
      ;;

    "O")
      EXTRA_PAR+=(-O)
      FRIDA_MODE=1
      ;;

    "Q")
      EXTRA_PAR+=(-Q)
      QEMU_MODE=1
      ;;

    # -Y selects the same mode as -X and is forwarded as -X.
    "Y")
      EXTRA_PAR+=(-X)
      NYX_MODE=1
      ;;

    "X")
      EXTRA_PAR+=(-X)
      NYX_MODE=1
      ;;

    "U")
      EXTRA_PAR+=(-U)
      UNICORN_MODE=1
      ;;

    "T")
      T_ARG="$OPTARG"
      ;;

    "?")
      exit 1
      ;;

  esac

done

shift $((OPTIND-1))

TARGET_BIN="$1"

if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then

  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -T tasks      - how many parallel processes to create (default=1, "all"=nproc)
  -f file       - location read by the fuzzed program (default: stdin)
  -m megs       - memory limit for child process (default=$MEM_LIMIT MB)
  -t msec       - run time limit for child process (default: 5000ms)
  -O            - use binary-only instrumentation (FRIDA mode)
  -Q            - use binary-only instrumentation (QEMU mode)
  -U            - use unicorn-based instrumentation (Unicorn mode)
  -X            - use Nyx mode

Minimization settings:

  -A            - allow crashing and timeout inputs
  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult README.md.

Environment variables used:
AFL_KEEP_TRACES: leave the temporary <out_dir>/.traces directory
AFL_NO_FORKSRV: run target via execve instead of using the forkserver
AFL_PATH: last resort location to find the afl-showmap binary
AFL_SKIP_BIN_CHECK: skip check for target binary
AFL_CUSTOM_MUTATOR_LIBRARY: custom mutator library (post_process and send)
AFL_PYTHON_MODULE: custom mutator library (post_process and send)
_EOF_
  exit 1
fi

# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.

#if [ "$AFL_ALLOW_TMP" = "" ]; then
#
#  echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
#  T1="$?"
#
#  echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
#  T2="$?"
#
#  echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
#  T3="$?"
#
#  echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
#  T4="$?"
#
#  echo "$PWD" | grep -qE '^(/var)?/tmp/'
#  T5="$?"
#
#  if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
#    echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
#    exit 1
#  fi
#
#fi

# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

TRACE_DIR="$OUT_DIR/.traces"

if [ "$STDIN_FILE" = "" ]; then

  if echo "$*" | grep -qF '@@'; then
    STDIN_FILE="$TRACE_DIR/.cur_input"
  fi

fi

# Check for obvious errors.

if [ ! "$T_ARG" = "" -a -n "$F_ARG" -a ! "$NYX_MODE" == 1 ]; then
  echo "[-] Error: -T and -f can not be used together." 1>&2
  exit 1
fi

if [ ! "$MEM_LIMIT" = "none" ]; then

  if [ "$MEM_LIMIT" -lt "5" ]; then
    echo "[-] Error: dangerously low memory limit." 1>&2
    exit 1
  fi

fi

if [ ! "$TIMEOUT" = "none" ]; then

  if [ "$TIMEOUT" -lt "10" ]; then
    echo "[-] Error: dangerously low timeout." 1>&2
    exit 1
  fi

fi

if [ "$NYX_MODE" = "" ]; then

  if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then

    # type -P is a bash builtin that searches PATH for executable files only,
    # so no external 'which' is needed and builtins/functions never match.
    TNEW="$(type -P "$TARGET_BIN" 2>/dev/null)"

    if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
      echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
      exit 1
    fi

    TARGET_BIN="$TNEW"

  fi

fi

grep -aq AFL_DUMP_MAP_SIZE "$TARGET_BIN" && {

  echo "[!] Trying to obtain the map size of the target ..."

  # Only a bare name needs the ./ prefix to be executed from the current
  # directory; a path that already contains a slash (absolute, or resolved
  # through PATH above) has to be run unchanged.
  case "$TARGET_BIN" in
    */*) MAPSIZE_BIN="$TARGET_BIN" ;;
    *)   MAPSIZE_BIN="./$TARGET_BIN" ;;
  esac

  MAPSIZE=$(AFL_DUMP_MAP_SIZE=1 "$MAPSIZE_BIN" 2>/dev/null)

  test -n "$MAPSIZE" && {
    export AFL_MAP_SIZE=$MAPSIZE
    echo "[+] Setting AFL_MAP_SIZE=$MAPSIZE"
  }

}

if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$FRIDA_MODE" = "" -a "$UNICORN_MODE" = "" -a "$NYX_MODE" = "" ]; then

  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi

if [ ! -d "$IN_DIR" ]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

# Accept an afl-fuzz output directory and descend into its queue.
test -d "$IN_DIR/default" && IN_DIR="$IN_DIR/default"
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"

# Clear leftovers of a previous run; if anything else remains in the output
# directory, rmdir fails and the directory check below aborts.
find "$OUT_DIR" -maxdepth 1 -name 'id[:_]*' -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null

rmdir "$OUT_DIR" 2>/dev/null

if [ -d "$OUT_DIR" ]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

mkdir -m 700 -p "$TRACE_DIR" || exit 1

if [ ! "$STDIN_FILE" = "" ]; then
  rm -f "$STDIN_FILE" || exit 1
  touch "$STDIN_FILE" || exit 1
fi

# Locate afl-showmap: PATH first, then next to this script, then the current
# directory, and AFL_PATH only as the last resort (as the usage text states).
# An afl-showmap that was already found is never replaced by AFL_PATH.

SHOWMAP=$(command -v afl-showmap 2>/dev/null)

if [ -z "$SHOWMAP" ]; then
  TMP="${0%/afl-cmin.bash}/afl-showmap"
  if [ -x "$TMP" ]; then
    SHOWMAP=$TMP
  fi
fi

if [ -z "$SHOWMAP" ]; then
  if [ -x "./afl-showmap" ]; then
    SHOWMAP="./afl-showmap"
  elif [ -n "$AFL_PATH" ]; then
    SHOWMAP="$AFL_PATH/afl-showmap"
  fi
fi

if [ ! -x "$SHOWMAP" ]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# THREADS stays empty for sequential operation.
THREADS=
if [ ! "$T_ARG" = "" ]; then
  if [ "$T_ARG" = "all" ]; then
    THREADS=$(nproc)
  elif [ "$T_ARG" = "1" ]; then
    # One task is the documented default: run sequentially.
    :
  elif [ "$T_ARG" -gt 1 -a "$T_ARG" -le "$(nproc)" ] 2>/dev/null; then
    THREADS=$T_ARG
  else
    # An invalid task count is fatal rather than a silent fall back to a
    # single task.
    echo "[-] Error: -T parameter must between 2 and $(nproc) or \"all\"." 1>&2
    rm -rf "$TRACE_DIR"
    exit 1
  fi
else
  if [ -z "$F_ARG" ]; then
    echo "[*] Are you aware of the '-T all' parallelize option that massively improves the speed?"
  fi
fi

# The $(( )) wrapper strips any padding wc may emit and turns an empty result
# into 0.
IN_COUNT=$(($(ls -- "$IN_DIR" 2>/dev/null | wc -l)))

if [ "$IN_COUNT" = "0" ]; then
  echo "[-] Hmm, no inputs in the target directory. Nothing to be done."
  rm -rf "$TRACE_DIR"
  exit 1
fi

echo "[*] Are you aware that afl-cmin is faster than this afl-cmin.bash script?"
echo "[+] Found $IN_COUNT files for minimizing."

FIRST_FILE=$(ls -- "$IN_DIR" | head -1)

# Make sure that we're not dealing with a directory.

if [ -d "$IN_DIR/$FIRST_FILE" ]; then
  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Check for the more efficient way to copy files...

if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi

# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

if [ "$STDIN_FILE" = "" ]; then

  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z "${EXTRA_PAR[@]}" -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z "${EXTRA_PAR[@]}" -H "$STDIN_FILE" -- "$@" </dev/null

fi

# NOTE(review): the text between the afl-showmap call above and the trace
# cleanup below was truncated in the reviewed copy. The FIRST_COUNT check and
# its two messages were restored from the upstream script - confirm.
FIRST_COUNT=$(($(grep -c . "$TRACE_DIR/.run_test")))

if [ "$FIRST_COUNT" -gt "0" ]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1

fi

# For parallel operation, write the list of inputs to a file and split it into
# one chunk per task.
TMPFILE=$OUT_DIR/.list.$$
if [ ! "$THREADS" = "" ]; then
  ls -- "$IN_DIR" >"$TMPFILE" 2>/dev/null
  IN_COUNT=$(($(wc -l <"$TMPFILE")))
  SPLIT=$((IN_COUNT / THREADS))
  if [ "$((IN_COUNT % THREADS))" -gt 0 ]; then
    SPLIT=$((SPLIT + 1))
  fi
  echo "[+] Splitting workload into $THREADS tasks with $SPLIT items on average each."
  split -l "$SPLIT" "$TMPFILE" "$TMPFILE."
fi

# Let's roll!

#############################
# STEP 1: COLLECTING TRACES #
#############################

echo "[*] Obtaining traces for input files in '$IN_DIR'..."

if [ "$THREADS" = "" ]; then
(

  CUR=0

  if [ "$STDIN_FILE" = "" ]; then

    ls -- "$IN_DIR" | while read -r fn; do

      CUR=$((CUR+1))
      printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z "${EXTRA_PAR[@]}" -- "$@" <"$IN_DIR/$fn"

    done

  else

    ls -- "$IN_DIR" | while read -r fn; do

      CUR=$((CUR+1))
      printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"

      cp "$IN_DIR/$fn" "$STDIN_FILE"

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z "${EXTRA_PAR[@]}" -H "$STDIN_FILE" -- "$@" </dev/null

    done

  fi

  echo

)

else

  # NOTE(review): the end of the sequential loop above, this whole parallel
  # branch and the STEP 2 header/pipeline were truncated in the reviewed copy
  # and have been restored from the upstream script - confirm.

  PIDS=

  for inputs in "$TMPFILE".*; do

    # Skip the literal pattern if split produced no chunk files.
    [ -e "$inputs" ] || continue

    (

      if [ "$STDIN_FILE" = "" ]; then

        while read -r fn; do

          "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z "${EXTRA_PAR[@]}" -- "$@" <"$IN_DIR/$fn"

        done <"$inputs"

      else

        if [ -s "$inputs" ]; then

          # Every task needs its own scratch input file so that the tasks do
          # not overwrite each other's test case.
          STDIN_FILE="$inputs.$$"

          while read -r fn; do

            cp "$IN_DIR/$fn" "$STDIN_FILE"
            "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z "${EXTRA_PAR[@]}" -H "$STDIN_FILE" -- "$@" </dev/null

          done <"$inputs"

          rm -f "$STDIN_FILE"

        fi

      fi

    ) &

    PIDS="$PIDS $!"

  done

  echo "[+] Waiting for running tasks IDs:$PIDS"
  wait
  echo "[+] all $THREADS running tasks completed."
  rm -f "$TMPFILE"*

fi

##########################
# STEP 2: SORTING TUPLES #
##########################

# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.

echo "[*] Sorting trace sets (this may take a while)..."

ls -- "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
  sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"

TUPLE_COUNT=$(($(grep -c . "$TRACE_DIR/.all_uniq")))

echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."

#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################

# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.

echo "[*] Finding best candidates for each tuple..."

CUR=0

ls -rS -- "$IN_DIR" | while read -r fn; do

  CUR=$((CUR+1))
  printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"

  # Append " <file name>" to every tuple line. The name is handed to awk
  # through the environment so that characters such as '#', '&' or '\' in a
  # file name are copied literally instead of being interpreted as part of a
  # sed expression.
  CMIN_FN="$fn" awk '{ print $0 " " ENVIRON["CMIN_FN"] }' "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"

done

echo

##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.

echo "[*] Sorting candidate list (be patient)..."

sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" >"$TRACE_DIR/.candidate_best"

if [ ! -s "$TRACE_DIR/.candidate_best" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi

# Load the sorted pairs into BEST_FILE[tuple]="fname". The pairs are read as
# data rather than turned into a generated script that gets sourced, so that
# quotes, '$' or backticks in a file name can never be evaluated as shell
# code. BEST_FILE is a (sparse) indexed array; its subscript is evaluated
# arithmetically, which is why the bare variable name works as an index.

BEST_FILE=()

while IFS= read -r pair; do

  tuple=${pair%% *}   # everything before the first space
  [ -n "$tuple" ] || continue
  BEST_FILE[tuple]=${pair#* }   # everything after the first space

done <"$TRACE_DIR/.candidate_best"

##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch "$TRACE_DIR/.already_have"

while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf '\r    Processing tuple %s/%s with count %s... ' "$CUR" "$TUPLE_COUNT" "$cnt"

  # If we already have this tuple, skip it.

  grep -qxF -- "$tuple" "$TRACE_DIR/.already_have" && continue

  FN=${BEST_FILE[tuple]}

  # A tuple without a candidate file has nothing to contribute.
  [ -n "$FN" ] || continue

#  echo "tuple nr $CUR ($tuple cnt=$cnt) -> $FN" >> "$TRACE_DIR/.log"
  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"

  # Deduplicate the "already have" list only every fifth tuple; in between,
  # plain appending is cheaper than re-sorting.
  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo

# Normalised like IN_COUNT so that the comparison below also works when wc
# pads its output.
OUT_COUNT=$(($(ls -- "$OUT_DIR" | wc -l)))

if [ "$OUT_COUNT" = "1" ]; then
  echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"

exit 0