Merge branch 'master' into improve-gototab

author: Arnold D. Robbins <arnold@skeeve.com> 2023-11-20 22:04:23 +0200
committer: Arnold D. Robbins <arnold@skeeve.com> 2023-11-20 22:04:23 +0200
commit: 345f907c404ff05165834601009835a42c90463d (patch)
tree: 94a291fd4eb6f1cdd327a4ca0492b43e33aab3e3
parent: 12793c04b28c84e0af830c81ea51c90ab65eb585 (diff)
parent: ad4249ec70c04690a797a8d8310b1d4ee69a357b (diff)
download: one-true-awk-345f907c404ff05165834601009835a42c90463d.tar.gz
3 files changed, 64 insertions, 94 deletions
diff --git a/FIXES b/FIXES
index bc59f69..5d2b459 100644
--- a/FIXES
+++ b/FIXES
@@ -25,10 +25,16 @@ THIS SOFTWARE.
 This file lists all bug fixes, changes, etc., made since the 
 second edition of the AWK book was published in September 2023.
 
+Nov 20, 2023
+	rewrite of fnematch to fix a number of issues, including
+	extraneous output, out-of-bounds access, number of bytes
+	to push back after a failed match etc.
+	thanks to Miguel Pineiro Jr.
+
 Nov 15, 2023
 	Man page edit, regression test fixes. thanks to Arnold Robbins
 	consolidation of sub and gsub into dosub, removing duplicate
-	code. thanks to Miguel Pineiro Jr.	
+	code. thanks to Miguel Pineiro Jr.
 	gcc replaced with cc everywhere.
 
 Oct 30, 2023:
diff --git a/b.c b/b.c
index 8fd9988..4bc43bf 100644
--- a/b.c
+++ b/b.c
@@ -827,59 +827,6 @@ int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
 
 #define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
 
-// Read one rune at a time from the given FILE*. Return both
-// the bytes and the actual rune.
-
-struct runedata {
-	int rune;
-	size_t len;
-	char bytes[6];
-};
-
-struct runedata getrune(FILE *fp)
-{
-	struct runedata result;
-	int c, next;
-
-	memset(&result, 0, sizeof(result));
-
-	c = getc(fp);
-	if (c == EOF)
-		return result;	// result.rune == 0 --> EOF
-	else if (c < 128 || awk_mb_cur_max == 1) {
-		result.bytes[0] = c;
-		result.len = 1;
-		result.rune = c;
-
-		return result;
-	}
-
-	// need to get bytes and fill things in
-	result.bytes[0] = c;
-	result.len = 1;
-
-	next = 1;
-	for (int i = 1; i < MAX_UTF_BYTES; i++) {
-		c = getc(fp);
-		if (c == EOF)
-			break;
-		result.bytes[next++] = c;
-		result.len++;
-	}
-
-	// put back any extra input bytes
-	int actual_len = u8_nextlen(result.bytes);
-	while (result.len > actual_len) {
-		ungetc(result.bytes[--result.len], fp);
-	}
-
-	result.bytes[result.len] = '\0';
-	(void) u8_rune(& result.rune, (uschar *) result.bytes);
-
-	return result;
-}
-
-
 /*
  * NAME
  *     fnematch
@@ -897,58 +844,76 @@ struct runedata getrune(FILE *fp)
 
 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
 {
-	char *buf = *pbuf;
+	char *i, *j, *k, *buf = *pbuf;
 	int bufsize = *pbufsize;
-	int i, j, k, ns, s;
-	struct runedata r;
+	int c, n, ns, s;
 
 	s = pfa->initstat;
 	patlen = 0;
 
 	/*
-	 * All indices relative to buf.
-	 * i <= j <= k <= bufsize
+	 * buf <= i <= j <= k <= buf+bufsize
 	 *
-	 * i: origin of active substring (first byte of first character)
-	 * j: current character		(last byte of current character)
-	 * k: destination of next getc()
+	 * i: origin of active substring
+	 * j: current character
+	 * k: destination of the next getc
 	 */
-	i = -1, k = 0;
-        do {
-		j = i++;
-		do {
-			r = getrune(f);
-			if ((++j + r.len) >= k) {
-				if (k >= bufsize)
-					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
-						FATAL("stream '%.30s...' too long", buf);
-			}
-			memcpy(buf + k, r.bytes, r.len);
-			j += r.len - 1;	// incremented next time around the loop
-			k += r.len;
 
-			if ((ns = get_gototab(pfa, s, r.rune)) != 0)
-				s = ns;
-			else
-				s = cgoto(pfa, s, r.rune);
+	i = j = k = buf;
 
-			if (pfa->out[s]) {	/* final state */
-				patlen = j - i + 1;
-				if (r.rune == 0)	/* don't count $ */
-					patlen--;
+	do {
+		/*
+		 * Call u8_rune with at least MAX_UTF_BYTES ahead in
+		 * the buffer until EOF interferes.
+		 */
+		if (k - j < MAX_UTF_BYTES) {
+			if (k + MAX_UTF_BYTES > buf + bufsize) {
+				adjbuf((char **) &buf, &bufsize,
+				    bufsize + MAX_UTF_BYTES,
+				    quantum, 0, "fnematch");
+			}
+			for (n = MAX_UTF_BYTES ; n > 0; n--) {
+				*k++ = (c = getc(f)) != EOF ? c : 0;
+				if (c == EOF) {
+					if (ferror(f))
+						FATAL("fnematch: getc error");
+					break;
+				}
 			}
-		} while (buf[j] && s != 1);
+		}
+
+		j += u8_rune(&c, (uschar *)j);
+
+		if ((ns = get_gototab(pfa, s, c)) != 0)
+			s = ns;
+		else
+			s = cgoto(pfa, s, c);
+
+		if (pfa->out[s]) {	/* final state */
+			patbeg = i;
+			patlen = j - i;
+			if (c == 0)	/* don't count $ */
+				patlen--;
+		}
+
+		if (c && s != 1)
+			continue;  /* origin i still viable, next j */
+		if (patlen)
+			break;     /* best match found */
+
+		/* no match at origin i, next i and start over */
+		i += u8_rune(&c, (uschar *)i);
+		if (c == 0)
+			break;    /* no match */
+		j = i;
 		s = 2;
-		if (r.len > 1)
-			i += r.len - 1;	// i incremented around the loop
-	} while (buf[i] && !patlen);
+	} while (1);
 
 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
 	*pbuf = buf;
 	*pbufsize = bufsize;
 
 	if (patlen) {
-		patbeg = (char *) buf + i;
 		/*
 		 * Under no circumstances is the last character fed to
 		 * the automaton part of the match. It is EOF's nullbyte,
@@ -961,11 +926,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
 		 * terminate the buffer.
 		 */
 		do
-			for (int ii = r.len; ii > 0; ii--)
-				if (buf[--k] && ungetc(buf[k], f) == EOF)
-					FATAL("unable to ungetc '%c'", buf[k]);
-		while (k > i + patlen);
-		buf[k] = '\0';
+			if (*--k && ungetc(*k, f) == EOF)
+				FATAL("unable to ungetc '%c'", *k);
+		while (k > patbeg + patlen);
+		*k = '\0';
 		return true;
 	}
 	else
diff --git a/main.c b/main.c
index 5f07419..4f2d78a 100644
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 THIS SOFTWARE.
 ****************************************************************/
 
-const char	*version = "version 20231116";
+const char	*version = "version 20231120";
 
 #define DEBUG
 #include <stdio.h>
author	Arnold D. Robbins <arnold@skeeve.com>	2023-11-20 22:04:23 +0200
committer	Arnold D. Robbins <arnold@skeeve.com>	2023-11-20 22:04:23 +0200
commit	345f907c404ff05165834601009835a42c90463d (patch)
tree	94a291fd4eb6f1cdd327a4ca0492b43e33aab3e3
parent	12793c04b28c84e0af830c81ea51c90ab65eb585 (diff)
parent	ad4249ec70c04690a797a8d8310b1d4ee69a357b (diff)
download	one-true-awk-345f907c404ff05165834601009835a42c90463d.tar.gz