diff options
Diffstat (limited to 'blkiomon.c')
-rw-r--r-- | blkiomon.c | 756 |
1 files changed, 756 insertions, 0 deletions
diff --git a/blkiomon.c b/blkiomon.c new file mode 100644 index 0000000..9fc4d75 --- /dev/null +++ b/blkiomon.c @@ -0,0 +1,756 @@ +/* + * I/O monitor based on block queue trace data + * + * Copyright IBM Corp. 2008 + * + * Author(s): Martin Peschke <mp3@de.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> +#include <getopt.h> +#include <errno.h> +#include <locale.h> +#include <libgen.h> +#include <sys/msg.h> +#include <pthread.h> +#include <time.h> + +#include "blktrace.h" +#include "rbtree.h" +#include "jhash.h" +#include "blkiomon.h" + +struct trace { + struct blk_io_trace bit; + struct rb_node node; + struct trace *next; + long sequence; +}; + +struct rb_search { + struct rb_node **node_ptr; + struct rb_node *parent; +}; + +struct dstat_msg { + long mtype; + struct blkiomon_stat stat; +}; + +struct dstat { + struct dstat_msg msg; + struct rb_node node; + struct dstat *next; +}; + +struct output { + char *fn; + FILE *fp; + char *buf; + int pipe; +}; + +static char blkiomon_version[] = "0.3"; + +static FILE *ifp; +static int interval = -1; + +static struct trace *vacant_traces_list = NULL; +static int vacant_traces = 0; + +#define TRACE_HASH_SIZE 128 +struct trace *thash[TRACE_HASH_SIZE] = {}; + +static struct dstat *vacant_dstats_list = NULL; +static struct rb_root dstat_tree[2] = { RB_ROOT, RB_ROOT }; +static struct dstat *dstat_list[2] = {}; +static int dstat_curr = 0; + +static struct output drvdata, human, binary, debug; + +static char *msg_q_name = NULL; +static int msg_q_id = -1, msg_q = -1; +static long msg_id = -1; + +static pthread_t interval_thread; +static pthread_mutex_t dstat_mutex = PTHREAD_MUTEX_INITIALIZER; + +int data_is_native = -1; + +static int up = 1; + +/* debugging */ +static long leftover = 0, driverdata = 0, match = 0, mismatch = 0, sequence = 0; + +static void dump_bit(struct trace *t, const char *descr) +{ + struct blk_io_trace *bit = &t->bit; + + if (!debug.fn) + return; + + fprintf(debug.fp, "--- %s ---\n", descr); + fprintf(debug.fp, "magic %16d\n", bit->magic); + fprintf(debug.fp, "sequence %16d\n", bit->sequence); + fprintf(debug.fp, "time %16ld\n", (unsigned long)bit->time); + fprintf(debug.fp, "sector %16ld\n", (unsigned long)bit->sector); + fprintf(debug.fp, "bytes %16d\n", bit->bytes); + fprintf(debug.fp, "action %16x\n", bit->action); + fprintf(debug.fp, "pid %16d\n", bit->pid); + fprintf(debug.fp, "device %16d\n", bit->device); + fprintf(debug.fp, "cpu %16d\n", bit->cpu); + fprintf(debug.fp, "error %16d\n", bit->error); + fprintf(debug.fp, "pdu_len %16d\n", bit->pdu_len); + + fprintf(debug.fp, "order %16ld\n", t->sequence); +} + +static void dump_bits(struct trace *t1, struct trace *t2, const char *descr) +{ + struct blk_io_trace *bit1 = &t1->bit; + struct blk_io_trace *bit2 = &t2->bit; + + if (!debug.fn) + return; + + fprintf(debug.fp, "--- %s ---\n", descr); + fprintf(debug.fp, "magic %16d %16d\n", bit1->magic, bit2->magic); + fprintf(debug.fp, "sequence %16d %16d\n", + bit1->sequence, bit2->sequence); + fprintf(debug.fp, "time %16ld %16ld\n", + (unsigned long)bit1->time, (unsigned long)bit2->time); + fprintf(debug.fp, "sector %16ld %16ld\n", + (unsigned long)bit1->sector, (unsigned long)bit2->sector); + fprintf(debug.fp, "bytes %16d %16d\n", bit1->bytes, bit2->bytes); + fprintf(debug.fp, "action %16x %16x\n", bit1->action, bit2->action); + fprintf(debug.fp, "pid %16d %16d\n", bit1->pid, bit2->pid); + fprintf(debug.fp, "device %16d %16d\n", bit1->device, bit2->device); + fprintf(debug.fp, "cpu %16d %16d\n", bit1->cpu, bit2->cpu); + fprintf(debug.fp, "error %16d %16d\n", bit1->error, bit2->error); + fprintf(debug.fp, "pdu_len %16d %16d\n", bit1->pdu_len, bit2->pdu_len); + + fprintf(debug.fp, "order %16ld %16ld\n", t1->sequence, t2->sequence); +} + +static struct dstat *blkiomon_alloc_dstat(void) +{ + struct dstat *dstat; + + if (vacant_dstats_list) { + dstat = vacant_dstats_list; + vacant_dstats_list = dstat->next; + } else + dstat = malloc(sizeof(*dstat)); + if (!dstat) { + fprintf(stderr, + "blkiomon: could not allocate device statistic"); + return NULL; + } + + blkiomon_stat_init(&dstat->msg.stat); + return dstat; +} + +static struct dstat *blkiomon_find_dstat(struct rb_search *search, __u32 device) +{ + struct rb_node **p = &(dstat_tree[dstat_curr].rb_node); + struct rb_node *parent = NULL; + struct dstat *dstat; + + while (*p) { + parent = *p; + + dstat = rb_entry(parent, struct dstat, node); + + if (dstat->msg.stat.device < device) + p = &(*p)->rb_left; + else if (dstat->msg.stat.device > device) + p = &(*p)->rb_right; + else + return dstat; + } + search->node_ptr = p; + search->parent = parent; + return NULL; +} + +static struct dstat *blkiomon_get_dstat(__u32 device) +{ + struct dstat *dstat; + struct rb_search search; + + pthread_mutex_lock(&dstat_mutex); + + dstat = blkiomon_find_dstat(&search, device); + if (dstat) + goto out; + + dstat = blkiomon_alloc_dstat(); + if (!dstat) + goto out; + + dstat->msg.stat.device = device; + + rb_link_node(&dstat->node, search.parent, search.node_ptr); + rb_insert_color(&dstat->node, &dstat_tree[dstat_curr]); + + dstat->next = dstat_list[dstat_curr]; + dstat_list[dstat_curr] = dstat; + +out: + pthread_mutex_unlock(&dstat_mutex); + return dstat; +} + +static int blkiomon_output_msg_q(struct dstat *dstat) +{ + if (!msg_q_name) + return 0; + + dstat->msg.mtype = msg_id; + return msgsnd(msg_q, &dstat->msg, sizeof(struct blkiomon_stat), 0); +} + +static int blkiomon_output_binary(struct dstat *dstat) +{ + struct blkiomon_stat *p = &dstat->msg.stat; + + if (!binary.fn) + return 0; + + if (fwrite(p, sizeof(*p), 1, binary.fp) != 1) + goto failed; + if (binary.pipe && fflush(binary.fp)) + goto failed; + return 0; + +failed: + fprintf(stderr, "blkiomon: could not write to %s\n", binary.fn); + fclose(binary.fp); + binary.fn = NULL; + return 1; +} + +static struct dstat *blkiomon_output(struct dstat *head, struct timespec *ts) +{ + struct dstat *dstat, *tail = NULL; + + for (dstat = head; dstat; dstat = dstat->next) { + dstat->msg.stat.time = ts->tv_sec; + blkiomon_stat_print(human.fp, &dstat->msg.stat); + blkiomon_stat_to_be(&dstat->msg.stat); + blkiomon_output_binary(dstat); + blkiomon_output_msg_q(dstat); + tail = dstat; + } + return tail; +} + +static void *blkiomon_interval(void *data) +{ + struct timespec wake, r; + struct dstat *head, *tail; + int finished; + + clock_gettime(CLOCK_REALTIME, &wake); + + while (1) { + wake.tv_sec += interval; + if (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &wake, &r)) { + fprintf(stderr, "blkiomon: interrupted sleep"); + continue; + } + + /* grab tree and make data gatherer build up another tree */ + pthread_mutex_lock(&dstat_mutex); + finished = dstat_curr; + dstat_curr = dstat_curr ? 0 : 1; + pthread_mutex_unlock(&dstat_mutex); + + head = dstat_list[finished]; + if (!head) + continue; + dstat_list[finished] = NULL; + dstat_tree[finished] = RB_ROOT; + tail = blkiomon_output(head, &wake); + + pthread_mutex_lock(&dstat_mutex); + tail->next = vacant_dstats_list; + vacant_dstats_list = head; + pthread_mutex_unlock(&dstat_mutex); + } + return data; +} + +#define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE)) + +static int blkiomon_account(struct blk_io_trace *bit_d, + struct blk_io_trace *bit_c) +{ + struct dstat *dstat; + struct blkiomon_stat *p; + __u64 d2c = (bit_c->time - bit_d->time) / 1000; /* ns -> us */ + __u32 size = bit_d->bytes; + __u64 thrput = size * 1000 / d2c; + + dstat = blkiomon_get_dstat(bit_d->device); + if (!dstat) + return 1; + p = &dstat->msg.stat; + + if (BLK_DATADIR(bit_c->action) & BLK_TC_READ) { + minmax_account(&p->thrput_r, thrput); + minmax_account(&p->size_r, size); + minmax_account(&p->d2c_r, d2c); + } else if (BLK_DATADIR(bit_c->action) & BLK_TC_WRITE) { + minmax_account(&p->thrput_w, thrput); + minmax_account(&p->size_w, size); + minmax_account(&p->d2c_w, d2c); + } else + p->bidir++; + + histlog2_account(p->size_hist, size, &size_hist); + histlog2_account(p->d2c_hist, d2c, &d2c_hist); + return 0; +} + +static struct trace *blkiomon_alloc_trace(void) +{ + struct trace *t = vacant_traces_list; + if (t) { + vacant_traces_list = t->next; + vacant_traces--; + } else + t = malloc(sizeof(*t)); + memset(t, 0, sizeof(*t)); + return t; +} + +static void blkiomon_free_trace(struct trace *t) +{ + if (vacant_traces < 256) { + t->next = vacant_traces_list; + vacant_traces_list = t; + vacant_traces++; + } else + free(t); +} + +static int action(int a) +{ + int bits = BLK_TC_WRITE | BLK_TC_READ | BLK_TC_FS | BLK_TC_PC; + return a & (BLK_TC_ACT(bits)); +} + +static void blkiomon_store_trace(struct trace *t) +{ + int i = t->bit.sector % TRACE_HASH_SIZE; + + t->next = thash[i]; + thash[i] = t; +} + +static struct trace *blkiomon_fetch_trace(struct blk_io_trace *bit) +{ + int i = bit->sector % TRACE_HASH_SIZE; + struct trace *t, *prev = NULL; + + for (t = thash[i]; t; t = t->next) { + if (t->bit.device == bit->device && + t->bit.sector == bit->sector && + action(t->bit.action) == action(bit->action)) { + if (prev) + prev->next = t->next; + else + thash[i] = t->next; + return t; + } + prev = t; + } + return NULL; +} + +static struct trace *blkiomon_do_trace(struct trace *t) +{ + struct trace *t_stored, *t_old, *t_young; + + /* store trace if there is no match yet */ + t_stored = blkiomon_fetch_trace(&t->bit); + if (!t_stored) { + blkiomon_store_trace(t); + return blkiomon_alloc_trace(); + } + + /* figure out older trace and younger trace */ + if (t_stored->bit.time < t->bit.time) { + t_old = t_stored; + t_young = t; + } else { + t_old = t; + t_young = t_stored; + } + + /* we need an older D trace and a younger C trace */ + if (t_old->bit.action & BLK_TC_ACT(BLK_TC_ISSUE) && + t_young->bit.action & BLK_TC_ACT(BLK_TC_COMPLETE)) { + /* matching D and C traces - update statistics */ + match++; + blkiomon_account(&t_old->bit, &t_young->bit); + blkiomon_free_trace(t_stored); + return t; + } + + /* no matching D and C traces - keep more recent trace */ + dump_bits(t_old, t_young, "mismatch"); + mismatch++; + blkiomon_store_trace(t_young); + return t_old; +} + +static int blkiomon_dump_drvdata(struct blk_io_trace *bit, void *pdu_buf) +{ + if (!drvdata.fn) + return 0; + + if (fwrite(bit, sizeof(*bit), 1, drvdata.fp) != 1) + goto failed; + if (fwrite(pdu_buf, bit->pdu_len, 1, drvdata.fp) != 1) + goto failed; + if (drvdata.pipe && fflush(drvdata.fp)) + goto failed; + return 0; + +failed: + fprintf(stderr, "blkiomon: could not write to %s\n", drvdata.fn); + fclose(drvdata.fp); + drvdata.fn = NULL; + return 1; +} + +static int blkiomon_do_fifo(void) +{ + struct trace *t; + struct blk_io_trace *bit; + void *pdu_buf = NULL; + + t = blkiomon_alloc_trace(); + if (!t) + return 1; + bit = &t->bit; + + while (up) { + if (fread(bit, sizeof(*bit), 1, ifp) != 1) { + if (!feof(ifp)) + fprintf(stderr, + "blkiomon: could not read trace"); + break; + } + if (ferror(ifp)) { + clearerr(ifp); + fprintf(stderr, "blkiomon: error while reading trace"); + break; + } + + if (data_is_native == -1 && check_data_endianness(bit->magic)) { + fprintf(stderr, "blkiomon: endianess problem\n"); + break; + } + + /* endianess */ + trace_to_cpu(bit); + if (verify_trace(bit)) { + fprintf(stderr, "blkiomon: bad trace\n"); + break; + } + + /* read additional trace payload */ + if (bit->pdu_len) { + pdu_buf = realloc(pdu_buf, bit->pdu_len); + if (fread(pdu_buf, bit->pdu_len, 1, ifp) != 1) { + clearerr(ifp); + fprintf(stderr, "blkiomon: could not read payload\n"); + break; + } + } + + t->sequence = sequence++; + + /* forward low-level device driver trace to other tool */ + if (bit->action & BLK_TC_ACT(BLK_TC_DRV_DATA)) { + driverdata++; + if (blkiomon_dump_drvdata(bit, pdu_buf)) { + fprintf(stderr, "blkiomon: could not send trace\n"); + break; + } + continue; + } + + if (!(bit->action & BLK_TC_ACT(BLK_TC_ISSUE | BLK_TC_COMPLETE))) + continue; + + /* try to find matching trace and update statistics */ + t = blkiomon_do_trace(t); + if (!t) { + fprintf(stderr, "blkiomon: could not alloc trace\n"); + break; + } + bit = &t->bit; + /* t and bit will be recycled for next incoming trace */ + } + blkiomon_free_trace(t); + free(pdu_buf); + return 0; +} + +static int blkiomon_open_output(struct output *out) +{ + int mode, vbuf_size; + + if (!out->fn) + return 0; + + if (!strcmp(out->fn, "-")) { + out->fp = fdopen(STDOUT_FILENO, "w"); + mode = _IOLBF; + vbuf_size = 4096; + out->pipe = 1; + } else { + out->fp = fopen(out->fn, "w"); + mode = _IOFBF; + vbuf_size = 128 * 1024; + out->pipe = 0; + } + if (!out->fp) + goto failed; + out->buf = malloc(128 * 1024); + if (setvbuf(out->fp, out->buf, mode, vbuf_size)) + goto failed; + return 0; + +failed: + fprintf(stderr, "blkiomon: could not write to %s\n", out->fn); + out->fn = NULL; + free(out->buf); + return 1; +} + +static int blkiomon_open_msg_q(void) +{ + key_t key; + + if (!msg_q_name) + return 0; + if (!msg_q_id || msg_id <= 0) + return 1; + key = ftok(msg_q_name, msg_q_id); + if (key == -1) + return 1; + while (up) { + msg_q = msgget(key, S_IRWXU); + if (msg_q >= 0) + break; + } + return (msg_q >= 0 ? 0 : -1); +} + +static void blkiomon_debug(void) +{ + int i; + struct trace *t; + + if (!debug.fn) + return; + + for (i = 0; i < TRACE_HASH_SIZE; i++) + for (t = thash[i]; t; t = t->next) { + dump_bit(t, "leftover"); + leftover++; + } + + fprintf(debug.fp, "%ld leftover, %ld match, %ld mismatch, " + "%ld driverdata, %ld overall\n", + leftover, match, mismatch, driverdata, sequence); +} + +#define S_OPTS "b:d:D:h:I:Q:q:m:V" + +static char usage_str[] = "\n\nblkiomon " \ + "-I <interval> | --interval=<interval>\n" \ + "[ -h <file> | --human-readable=<file> ]\n" \ + "[ -b <file> | --binary=<file> ]\n" \ + "[ -D <file> | --debug=<file> ]\n" \ + "[ -Q <path name> | --msg-queue=<path name>]\n" \ + "[ -q <msg queue id> | --msg-queue-id=<msg queue id>]\n" \ + "[ -m <msg id> | --msg-id=<msg id>]\n" \ + "[ -V | --version ]\n\n" \ + "\t-I Sample interval.\n" \ + "\t-h Human-readable output file.\n" \ + "\t-b Binary output file.\n" \ + "\t-d Output file for data emitted by low level device driver.\n" \ + "\t-D Output file for debugging data.\n" \ + "\t-Qqm Output to message queue using given ID for messages.\n" \ + "\t-V Print program version.\n\n"; + +static struct option l_opts[] = { + { + .name = "human-readable", + .has_arg = required_argument, + .flag = NULL, + .val = 'h' + }, + { + .name = "binary", + .has_arg = required_argument, + .flag = NULL, + .val = 'b' + }, + { + .name = "dump-lldd", + .has_arg = required_argument, + .flag = NULL, + .val = 'd' + }, + { + .name = "debug", + .has_arg = required_argument, + .flag = NULL, + .val = 'D' + }, + { + .name = "interval", + .has_arg = required_argument, + .flag = NULL, + .val = 'I' + }, + { + .name = "msg-queue", + .has_arg = required_argument, + .flag = NULL, + .val = 'Q' + }, + { + .name = "msg-queue-id", + .has_arg = required_argument, + .flag = NULL, + .val = 'q' + }, + { + .name = "msg-id", + .has_arg = required_argument, + .flag = NULL, + .val = 'm' + }, + { + .name = "version", + .has_arg = no_argument, + .flag = NULL, + .val = 'V' + }, + { + .name = NULL, + } +}; + +static void blkiomon_signal(int signal) +{ + fprintf(stderr, "blkiomon: terminated by signal\n"); + up = signal & 0; +} + +int main(int argc, char *argv[]) +{ + int c; + + signal(SIGALRM, blkiomon_signal); + signal(SIGINT, blkiomon_signal); + signal(SIGTERM, blkiomon_signal); + signal(SIGQUIT, blkiomon_signal); + + while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) { + switch (c) { + case 'h': + human.fn = optarg; + break; + case 'b': + binary.fn = optarg; + break; + case 'd': + drvdata.fn = optarg; + break; + case 'D': + debug.fn = optarg; + break; + case 'I': + interval = atoi(optarg); + break; + case 'Q': + msg_q_name = optarg; + break; + case 'q': + msg_q_id = atoi(optarg); + break; + case 'm': + msg_id = atoi(optarg); + break; + case 'V': + printf("%s version %s\n", argv[0], blkiomon_version); + return 0; + default: + fprintf(stderr, "Usage: %s", usage_str); + return 1; + } + } + + if (interval <= 0) { + fprintf(stderr, "Usage: %s", usage_str); + return 1; + } + + ifp = fdopen(STDIN_FILENO, "r"); + if (!ifp) { + perror("blkiomon: could not open stdin for reading"); + return 1; + } + + if (blkiomon_open_output(&human)) + return 1; + if (blkiomon_open_output(&binary)) + return 1; + if (blkiomon_open_output(&drvdata)) + return 1; + if (blkiomon_open_output(&debug)) + return 1; + if (blkiomon_open_msg_q()) + return 1; + + if (pthread_create(&interval_thread, NULL, blkiomon_interval, NULL)) { + fprintf(stderr, "blkiomon: could not create thread"); + return 1; + } + + blkiomon_do_fifo(); + + blkiomon_debug(); + return 0; +} |