about summary refs log tree commit diff
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2023-11-20 22:04:23 +0200
committer Arnold D. Robbins <arnold@skeeve.com> 2023-11-20 22:04:23 +0200
commit 345f907c404ff05165834601009835a42c90463d (patch)
tree 94a291fd4eb6f1cdd327a4ca0492b43e33aab3e3
parent 12793c04b28c84e0af830c81ea51c90ab65eb585 (diff)
parent ad4249ec70c04690a797a8d8310b1d4ee69a357b (diff)
download one-true-awk-345f907c404ff05165834601009835a42c90463d.tar.gz
Merge branch 'master' into improve-gototab
-rw-r--r-- FIXES 8
-rw-r--r-- b.c 148
-rw-r--r-- main.c 2
3 files changed, 64 insertions, 94 deletions
diff --git a/FIXES b/FIXES
index bc59f69..5d2b459 100644
--- a/FIXES
+++ b/FIXES
@@ -25,10 +25,16 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the
second edition of the AWK book was published in September 2023.
+Nov 20, 2023
+ rewrite of fnematch to fix a number of issues, including
+ extraneous output, out-of-bounds access, number of bytes
+ to push back after a failed match etc.
+ thanks to Miguel Pineiro Jr.
+
Nov 15, 2023
Man page edit, regression test fixes. thanks to Arnold Robbins
consolidation of sub and gsub into dosub, removing duplicate
- code. thanks to Miguel Pineiro Jr.
+ code. thanks to Miguel Pineiro Jr.
gcc replaced with cc everywhere.
Oct 30, 2023:
diff --git a/b.c b/b.c
index 8fd9988..4bc43bf 100644
--- a/b.c
+++ b/b.c
@@ -827,59 +827,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
-// Read one rune at a time from the given FILE*. Return both
-// the bytes and the actual rune.
-
-struct runedata {
- int rune;
- size_t len;
- char bytes[6];
-};
-
-struct runedata getrune(FILE *fp)
-{
- struct runedata result;
- int c, next;
-
- memset(&result, 0, sizeof(result));
-
- c = getc(fp);
- if (c == EOF)
- return result; // result.rune == 0 --> EOF
- else if (c < 128 || awk_mb_cur_max == 1) {
- result.bytes[0] = c;
- result.len = 1;
- result.rune = c;
-
- return result;
- }
-
- // need to get bytes and fill things in
- result.bytes[0] = c;
- result.len = 1;
-
- next = 1;
- for (int i = 1; i < MAX_UTF_BYTES; i++) {
- c = getc(fp);
- if (c == EOF)
- break;
- result.bytes[next++] = c;
- result.len++;
- }
-
- // put back any extra input bytes
- int actual_len = u8_nextlen(result.bytes);
- while (result.len > actual_len) {
- ungetc(result.bytes[--result.len], fp);
- }
-
- result.bytes[result.len] = '\0';
- (void) u8_rune(& result.rune, (uschar *) result.bytes);
-
- return result;
-}
-
-
/*
* NAME
* fnematch
@@ -897,58 +844,76 @@ struct runedata getrune(FILE *fp)
bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
{
- char *buf = *pbuf;
+ char *i, *j, *k, *buf = *pbuf;
int bufsize = *pbufsize;
- int i, j, k, ns, s;
- struct runedata r;
+ int c, n, ns, s;
s = pfa->initstat;
patlen = 0;
/*
- * All indices relative to buf.
- * i <= j <= k <= bufsize
+ * buf <= i <= j <= k <= buf+bufsize
*
- * i: origin of active substring (first byte of first character)
- * j: current character (last byte of current character)
- * k: destination of next getc()
+ * i: origin of active substring
+ * j: current character
+ * k: destination of the next getc
*/
- i = -1, k = 0;
- do {
- j = i++;
- do {
- r = getrune(f);
- if ((++j + r.len) >= k) {
- if (k >= bufsize)
- if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
- FATAL("stream '%.30s...' too long", buf);
- }
- memcpy(buf + k, r.bytes, r.len);
- j += r.len - 1; // incremented next time around the loop
- k += r.len;
- if ((ns = get_gototab(pfa, s, r.rune)) != 0)
- s = ns;
- else
- s = cgoto(pfa, s, r.rune);
+ i = j = k = buf;
- if (pfa->out[s]) { /* final state */
- patlen = j - i + 1;
- if (r.rune == 0) /* don't count $ */
- patlen--;
+ do {
+ /*
+ * Call u8_rune with at least MAX_UTF_BYTES ahead in
+ * the buffer until EOF interferes.
+ */
+ if (k - j < MAX_UTF_BYTES) {
+ if (k + MAX_UTF_BYTES > buf + bufsize) {
+ adjbuf((char **) &buf, &bufsize,
+ bufsize + MAX_UTF_BYTES,
+ quantum, 0, "fnematch");
+ }
+ for (n = MAX_UTF_BYTES ; n > 0; n--) {
+ *k++ = (c = getc(f)) != EOF ? c : 0;
+ if (c == EOF) {
+ if (ferror(f))
+ FATAL("fnematch: getc error");
+ break;
+ }
}
- } while (buf[j] && s != 1);
+ }
+
+ j += u8_rune(&c, (uschar *)j);
+
+ if ((ns = get_gototab(pfa, s, c)) != 0)
+ s = ns;
+ else
+ s = cgoto(pfa, s, c);
+
+ if (pfa->out[s]) { /* final state */
+ patbeg = i;
+ patlen = j - i;
+ if (c == 0) /* don't count $ */
+ patlen--;
+ }
+
+ if (c && s != 1)
+ continue; /* origin i still viable, next j */
+ if (patlen)
+ break; /* best match found */
+
+ /* no match at origin i, next i and start over */
+ i += u8_rune(&c, (uschar *)i);
+ if (c == 0)
+ break; /* no match */
+ j = i;
s = 2;
- if (r.len > 1)
- i += r.len - 1; // i incremented around the loop
- } while (buf[i] && !patlen);
+ } while (1);
/* adjbuf() may have relocated a resized buffer. Inform the world. */
*pbuf = buf;
*pbufsize = bufsize;
if (patlen) {
- patbeg = (char *) buf + i;
/*
* Under no circumstances is the last character fed to
* the automaton part of the match. It is EOF's nullbyte,
@@ -961,11 +926,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
* terminate the buffer.
*/
do
- for (int ii = r.len; ii > 0; ii--)
- if (buf[--k] && ungetc(buf[k], f) == EOF)
- FATAL("unable to ungetc '%c'", buf[k]);
- while (k > i + patlen);
- buf[k] = '\0';
+ if (*--k && ungetc(*k, f) == EOF)
+ FATAL("unable to ungetc '%c'", *k);
+ while (k > patbeg + patlen);
+ *k = '\0';
return true;
}
else
diff --git a/main.c b/main.c
index 5f07419..4f2d78a 100644
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
-const char *version = "version 20231116";
+const char *version = "version 20231120";
#define DEBUG
#include <stdio.h>