diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2023-11-20 22:04:23 +0200 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2023-11-20 22:04:23 +0200 |
commit | 345f907c404ff05165834601009835a42c90463d (patch) | |
tree | 94a291fd4eb6f1cdd327a4ca0492b43e33aab3e3 | |
parent | 12793c04b28c84e0af830c81ea51c90ab65eb585 (diff) | |
parent | ad4249ec70c04690a797a8d8310b1d4ee69a357b (diff) | |
download | one-true-awk-345f907c404ff05165834601009835a42c90463d.tar.gz |
Merge branch 'master' into improve-gototab
-rw-r--r-- | FIXES | 8 | ||||
-rw-r--r-- | b.c | 148 | ||||
-rw-r--r-- | main.c | 2 |
3 files changed, 64 insertions, 94 deletions
@@ -25,10 +25,16 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the second edition of the AWK book was published in September 2023. +Nov 20, 2023 + rewrite of fnematch to fix a number of issues, including + extraneous output, out-of-bounds access, number of bytes + to push back after a failed match etc. + thanks to Miguel Pineiro Jr. + Nov 15, 2023 Man page edit, regression test fixes. thanks to Arnold Robbins consolidation of sub and gsub into dosub, removing duplicate - code. thanks to Miguel Pineiro Jr. + code. thanks to Miguel Pineiro Jr. gcc replaced with cc everywhere. Oct 30, 2023: @@ -827,59 +827,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */ #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long -// Read one rune at a time from the given FILE*. Return both -// the bytes and the actual rune. - -struct runedata { - int rune; - size_t len; - char bytes[6]; -}; - -struct runedata getrune(FILE *fp) -{ - struct runedata result; - int c, next; - - memset(&result, 0, sizeof(result)); - - c = getc(fp); - if (c == EOF) - return result; // result.rune == 0 --> EOF - else if (c < 128 || awk_mb_cur_max == 1) { - result.bytes[0] = c; - result.len = 1; - result.rune = c; - - return result; - } - - // need to get bytes and fill things in - result.bytes[0] = c; - result.len = 1; - - next = 1; - for (int i = 1; i < MAX_UTF_BYTES; i++) { - c = getc(fp); - if (c == EOF) - break; - result.bytes[next++] = c; - result.len++; - } - - // put back any extra input bytes - int actual_len = u8_nextlen(result.bytes); - while (result.len > actual_len) { - ungetc(result.bytes[--result.len], fp); - } - - result.bytes[result.len] = '\0'; - (void) u8_rune(& result.rune, (uschar *) result.bytes); - - return result; -} - - /* * NAME * fnematch @@ -897,58 +844,76 @@ struct runedata getrune(FILE *fp) bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) { - char *buf = *pbuf; + char *i, *j, *k, *buf = *pbuf; int bufsize = *pbufsize; - int i, j, k, ns, s; - struct runedata r; + int c, n, ns, s; s = pfa->initstat; patlen = 0; /* - * All indices relative to buf. - * i <= j <= k <= bufsize + * buf <= i <= j <= k <= buf+bufsize * - * i: origin of active substring (first byte of first character) - * j: current character (last byte of current character) - * k: destination of next getc() + * i: origin of active substring + * j: current character + * k: destination of the next getc */ - i = -1, k = 0; - do { - j = i++; - do { - r = getrune(f); - if ((++j + r.len) >= k) { - if (k >= bufsize) - if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) - FATAL("stream '%.30s...' too long", buf); - } - memcpy(buf + k, r.bytes, r.len); - j += r.len - 1; // incremented next time around the loop - k += r.len; - if ((ns = get_gototab(pfa, s, r.rune)) != 0) - s = ns; - else - s = cgoto(pfa, s, r.rune); + i = j = k = buf; - if (pfa->out[s]) { /* final state */ - patlen = j - i + 1; - if (r.rune == 0) /* don't count $ */ - patlen--; + do { + /* + * Call u8_rune with at least MAX_UTF_BYTES ahead in + * the buffer until EOF interferes. + */ + if (k - j < MAX_UTF_BYTES) { + if (k + MAX_UTF_BYTES > buf + bufsize) { + adjbuf((char **) &buf, &bufsize, + bufsize + MAX_UTF_BYTES, + quantum, 0, "fnematch"); + } + for (n = MAX_UTF_BYTES ; n > 0; n--) { + *k++ = (c = getc(f)) != EOF ? c : 0; + if (c == EOF) { + if (ferror(f)) + FATAL("fnematch: getc error"); + break; + } } - } while (buf[j] && s != 1); + } + + j += u8_rune(&c, (uschar *)j); + + if ((ns = get_gototab(pfa, s, c)) != 0) + s = ns; + else + s = cgoto(pfa, s, c); + + if (pfa->out[s]) { /* final state */ + patbeg = i; + patlen = j - i; + if (c == 0) /* don't count $ */ + patlen--; + } + + if (c && s != 1) + continue; /* origin i still viable, next j */ + if (patlen) + break; /* best match found */ + + /* no match at origin i, next i and start over */ + i += u8_rune(&c, (uschar *)i); + if (c == 0) + break; /* no match */ + j = i; s = 2; - if (r.len > 1) - i += r.len - 1; // i incremented around the loop - } while (buf[i] && !patlen); + } while (1); /* adjbuf() may have relocated a resized buffer. Inform the world. */ *pbuf = buf; *pbufsize = bufsize; if (patlen) { - patbeg = (char *) buf + i; /* * Under no circumstances is the last character fed to * the automaton part of the match. It is EOF's nullbyte, @@ -961,11 +926,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) * terminate the buffer. */ do - for (int ii = r.len; ii > 0; ii--) - if (buf[--k] && ungetc(buf[k], f) == EOF) - FATAL("unable to ungetc '%c'", buf[k]); - while (k > i + patlen); - buf[k] = '\0'; + if (*--k && ungetc(*k, f) == EOF) + FATAL("unable to ungetc '%c'", *k); + while (k > patbeg + patlen); + *k = '\0'; return true; } else @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ -const char *version = "version 20231116"; +const char *version = "version 20231120"; #define DEBUG #include <stdio.h> |