aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2023-11-29 00:10:49 +0000
committer Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2023-11-29 00:10:49 +0000
commit e40c4a72a7db47c7614518690749cf4093b911a0 (patch)
tree b4b77bcd6c82504db09e7e1dbf5db683b59e7c45
parent 13f5893f3addb30e157763ad236e7c2156e4b198 (diff)
parent b7a213c11b2b60f1a1be18a29753132f0376d47f (diff)
download one-true-awk-android14-qpr2-s1-release.tar.gz
Change-Id: I6ce1405dccdee1d3b100db984c44280619d335ec
-rw-r--r-- FIXES 24
-rw-r--r-- METADATA 4
-rw-r--r-- README.md 4
-rw-r--r-- awk.1 3
-rw-r--r-- awk.h 11
-rw-r--r-- b.c 279
-rwxr-xr-x bugs-fixed/REGRESS 2
-rw-r--r-- lex.c 14
-rw-r--r-- main.c 2
-rw-r--r-- makefile 8
-rw-r--r-- maketab.c 4
-rw-r--r-- proto.h 3
-rw-r--r-- run.c 273
-rwxr-xr-x testdir/Compare.tt 2
-rwxr-xr-x testdir/REGRESS 2
-rwxr-xr-x testdir/T.csv 1
-rwxr-xr-x testdir/T.flags 5
-rwxr-xr-x testdir/T.misc 14
18 files changed, 356 insertions, 299 deletions
diff --git a/FIXES b/FIXES
index a13ca50..52f49e3 100644
--- a/FIXES
+++ b/FIXES
@@ -25,6 +25,29 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the
second edition of the AWK book was published in September 2023.
+Nov 24, 2023:
+ Fix issue #199: gototab improvements to dynamically resize the
+ table, qsort and bsearch to improve the lookup speed as the
+ table gets larger for multibyte input. thanks to Arnold Robbins.
+
+Nov 23, 2023:
+ Fix Issue #169, related to escape sequences in strings.
+ Thanks to Github user rajeevvp.
+ Fix Issue #147, reported by Github user drawkula, and fixed
+ by Miguel Pineiro Jr.
+
+Nov 20, 2023:
+ rewrite of fnematch to fix a number of issues, including
+ extraneous output, out-of-bounds access, number of bytes
+ to push back after a failed match etc.
+ thanks to Miguel Pineiro Jr.
+
+Nov 15, 2023:
+ Man page edit, regression test fixes. thanks to Arnold Robbins
+ consolidation of sub and gsub into dosub, removing duplicate
+ code. thanks to Miguel Pineiro Jr.
+ gcc replaced with cc everywhere.
+
Oct 30, 2023:
multiple fixes and a minor code cleanup.
disabled utf-8 for non-multibyte locales, such as C or POSIX.
@@ -32,7 +55,6 @@ Oct 30, 2023:
systems. also fixed an out-of-bounds read for empty CCL.
fixed a buffer overflow in substr with utf-8 strings.
many thanks to Todd C Miller.
-
Sep 24, 2023:
fnematch and getrune have been overhauled to solve issues around
diff --git a/METADATA b/METADATA
index 6ea18b5..2b83084 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
type: GIT
value: "https://github.com/onetrueawk/awk.git"
}
- version: "d801514094d1140dfc9f8571b9821082ddddf107"
+ version: "fbd1d5b712e27a9bb527e39ed6e9bf3b9afbb1df"
license_type: NOTICE
last_upgrade_date {
year: 2023
month: 11
- day: 6
+ day: 27
}
}
diff --git a/README.md b/README.md
index daace23..84fb06e 100644
--- a/README.md
+++ b/README.md
@@ -21,8 +21,6 @@ Arbitrary characters may be included with `\u` followed by 1 to 8 hexadecimal di
### Regular expressions ###
Regular expressions may include UTF-8 code points, including `\u`.
-Character classes are likely to be limited to about 256 characters
-when expanded.
### CSV ###
@@ -145,4 +143,4 @@ is not at the top of our priority list.
#### Last Updated
-Sun 15 Oct 2023 06:28:36 IDT
+Mon 16 Oct 2023 11:23:08 IDT
diff --git a/awk.1 b/awk.1
index 40ff0d3..ef40a01 100644
--- a/awk.1
+++ b/awk.1
@@ -586,6 +586,9 @@ the syntax is worse.
.PP
Input is expected to be UTF-8 encoded. Other multibyte
character sets are not handled.
+However, in eight-bit locales,
+.I awk
+treats each input byte as a separate character.
.SH UNUSUAL FLOATING-POINT VALUES
.I Awk
was designed before IEEE 754 arithmetic defined Not-A-Number (NaN)
diff --git a/awk.h b/awk.h
index 217319c..76180e4 100644
--- a/awk.h
+++ b/awk.h
@@ -246,14 +246,19 @@ typedef struct rrow {
int *lfollow;
} rrow;
-typedef struct gtt { /* gototab entry */
+typedef struct gtte { /* gototab entry */
unsigned int ch;
unsigned int state;
+} gtte;
+
+typedef struct gtt { /* gototab */
+ size_t allocated;
+ size_t inuse;
+ gtte *entries;
} gtt;
typedef struct fa {
- gtt **gototab;
- int gototab_len;
+ gtt *gototab;
uschar *out;
uschar *restr;
int **posns;
diff --git a/b.c b/b.c
index aa07d59..881c052 100644
--- a/b.c
+++ b/b.c
@@ -96,9 +96,8 @@ extern int u8_nextlen(const char *s);
mechanism of the goto table used 8-bit byte indices into the
gototab entries to compute the next state. Unicode is a lot
bigger, so the gototab entries are now structs with a character
- and a next state, and there is a linear search of the characters
- to find the state. (Yes, this is slower, by a significant
- amount. Tough.)
+ and a next state. These are sorted by code point and binary
+ searched.
Throughout the RE mechanism in b.c, utf-8 characters are
converted to their utf-32 value. This mostly shows up in
@@ -113,8 +112,10 @@ extern int u8_nextlen(const char *s);
*/
+static int entry_cmp(const void *l, const void *r);
static int get_gototab(fa*, int, int);
static int set_gototab(fa*, int, int, int);
+static void clear_gototab(fa*, int);
extern int u8_rune(int *, const uschar *);
static int *
@@ -142,7 +143,7 @@ resizesetvec(const char *f)
static void
resize_state(fa *f, int state)
{
- gtt **p;
+ gtt *p;
uschar *p2;
int **p3;
int i, new_count;
@@ -152,7 +153,7 @@ resize_state(fa *f, int state)
new_count = state + 10; /* needs to be tuned */
- p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
+ p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
if (p == NULL)
goto out;
f->gototab = p;
@@ -168,13 +169,14 @@ resize_state(fa *f, int state)
f->posns = p3;
for (i = f->state_count; i < new_count; ++i) {
- f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
- if (f->gototab[i] == NULL)
+ f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
+ if (f->gototab[i].entries == NULL)
goto out;
- f->out[i] = 0;
+ f->gototab[i].allocated = NCHARS;
+ f->gototab[i].inuse = 0;
+ f->out[i] = 0;
f->posns[i] = NULL;
}
- f->gototab_len = NCHARS; /* should be variable, growable */
f->state_count = new_count;
return;
out:
@@ -268,8 +270,7 @@ int makeinit(fa *f, bool anchor)
}
if ((f->posns[2])[1] == f->accept)
f->out[2] = 1;
- for (i = 0; i < NCHARS; i++)
- set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */
+ clear_gototab(f, 2);
f->curstat = cgoto(f, 2, HAT);
if (anchor) {
*f->posns[2] = k-1; /* leave out position 0 */
@@ -595,32 +596,104 @@ int member(int c, int *sarg) /* is c in s? */
return(0);
}
+static void resize_gototab(fa *f, int state)
+{
+ size_t new_size = f->gototab[state].allocated * 2;
+ gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
+ if (p == NULL)
+ overflo(__func__);
+
+ // need to initialize the new memory to zero
+ size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
+ memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
+
+ f->gototab[state].allocated = new_size; // update gototab info
+ f->gototab[state].entries = p;
+}
+
static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
{
- int i;
- for (i = 0; i < f->gototab_len; i++) {
- if (f->gototab[state][i].ch == 0)
- break;
- if (f->gototab[state][i].ch == ch)
- return f->gototab[state][i].state;
- }
- return 0;
+ gtte key;
+ gtte *item;
+
+ key.ch = ch;
+ key.state = 0; /* irrelevant */
+ item = bsearch(& key, f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte),
+ entry_cmp);
+
+ if (item == NULL)
+ return 0;
+ else
+ return item->state;
+}
+
+static int entry_cmp(const void *l, const void *r)
+{
+ const gtte *left, *right;
+
+ left = (const gtte *) l;
+ right = (const gtte *) r;
+
+ return left->ch - right->ch;
}
static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
{
- int i;
- for (i = 0; i < f->gototab_len; i++) {
- if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
- f->gototab[state][i].ch = ch;
- f->gototab[state][i].state = val;
- return val;
+ if (f->gototab[state].inuse == 0) {
+ f->gototab[state].entries[0].ch = ch;
+ f->gototab[state].entries[0].state = val;
+ f->gototab[state].inuse++;
+ return val;
+ } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
+ // not seen yet, insert and return
+ gtt *tab = & f->gototab[state];
+ if (tab->inuse + 1 >= tab->allocated)
+ resize_gototab(f, state);
+
+ f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
+ f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
+ f->gototab[state].inuse++;
+ return val;
+ } else {
+ // maybe we have it, maybe we don't
+ gtte key;
+ gtte *item;
+
+ key.ch = ch;
+ key.state = 0; /* irrelevant */
+ item = bsearch(& key, f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte),
+ entry_cmp);
+
+ if (item != NULL) {
+ // we have it, update state and return
+ item->state = val;
+ return item->state;
}
+ // otherwise, fall through to insert and reallocate.
}
- overflo(__func__);
+
+ gtt *tab = & f->gototab[state];
+ if (tab->inuse + 1 >= tab->allocated)
+ resize_gototab(f, state);
+ ++tab->inuse;
+ f->gototab[state].entries[tab->inuse].ch = ch;
+ f->gototab[state].entries[tab->inuse].state = val;
+
+ qsort(f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte), entry_cmp);
+
return val; /* not used anywhere at the moment */
}
+static void clear_gototab(fa *f, int state)
+{
+ memset(f->gototab[state].entries, 0,
+ f->gototab[state].allocated * sizeof(gtte));
+ f->gototab[state].inuse = 0;
+}
+
int match(fa *f, const char *p0) /* shortest match ? */
{
int s, ns;
@@ -759,59 +832,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
-// Read one rune at a time from the given FILE*. Return both
-// the bytes and the actual rune.
-
-struct runedata {
- int rune;
- size_t len;
- char bytes[6];
-};
-
-struct runedata getrune(FILE *fp)
-{
- struct runedata result;
- int c, next;
-
- memset(&result, 0, sizeof(result));
-
- c = getc(fp);
- if (c == EOF)
- return result; // result.rune == 0 --> EOF
- else if (c < 128 || awk_mb_cur_max == 1) {
- result.bytes[0] = c;
- result.len = 1;
- result.rune = c;
-
- return result;
- }
-
- // need to get bytes and fill things in
- result.bytes[0] = c;
- result.len = 1;
-
- next = 1;
- for (int i = 1; i < MAX_UTF_BYTES; i++) {
- c = getc(fp);
- if (c == EOF)
- break;
- result.bytes[next++] = c;
- result.len++;
- }
-
- // put back any extra input bytes
- int actual_len = u8_nextlen(result.bytes);
- while (result.len > actual_len) {
- ungetc(result.bytes[--result.len], fp);
- }
-
- result.bytes[result.len] = '\0';
- (void) u8_rune(& result.rune, (uschar *) result.bytes);
-
- return result;
-}
-
-
/*
* NAME
* fnematch
@@ -829,58 +849,76 @@ struct runedata getrune(FILE *fp)
bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
{
- char *buf = *pbuf;
+ char *i, *j, *k, *buf = *pbuf;
int bufsize = *pbufsize;
- int i, j, k, ns, s;
- struct runedata r;
+ int c, n, ns, s;
s = pfa->initstat;
patlen = 0;
/*
- * All indices relative to buf.
- * i <= j <= k <= bufsize
+ * buf <= i <= j <= k <= buf+bufsize
*
- * i: origin of active substring (first byte of first character)
- * j: current character (last byte of current character)
- * k: destination of next getc()
+ * i: origin of active substring
+ * j: current character
+ * k: destination of the next getc
*/
- i = -1, k = 0;
- do {
- j = i++;
- do {
- r = getrune(f);
- if ((++j + r.len) >= k) {
- if (k >= bufsize)
- if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
- FATAL("stream '%.30s...' too long", buf);
- }
- memcpy(buf + k, r.bytes, r.len);
- j += r.len - 1; // incremented next time around the loop
- k += r.len;
- if ((ns = get_gototab(pfa, s, r.rune)) != 0)
- s = ns;
- else
- s = cgoto(pfa, s, r.rune);
+ i = j = k = buf;
- if (pfa->out[s]) { /* final state */
- patlen = j - i + 1;
- if (r.rune == 0) /* don't count $ */
- patlen--;
+ do {
+ /*
+ * Call u8_rune with at least MAX_UTF_BYTES ahead in
+ * the buffer until EOF interferes.
+ */
+ if (k - j < MAX_UTF_BYTES) {
+ if (k + MAX_UTF_BYTES > buf + bufsize) {
+ adjbuf((char **) &buf, &bufsize,
+ bufsize + MAX_UTF_BYTES,
+ quantum, 0, "fnematch");
}
- } while (buf[j] && s != 1);
+ for (n = MAX_UTF_BYTES ; n > 0; n--) {
+ *k++ = (c = getc(f)) != EOF ? c : 0;
+ if (c == EOF) {
+ if (ferror(f))
+ FATAL("fnematch: getc error");
+ break;
+ }
+ }
+ }
+
+ j += u8_rune(&c, (uschar *)j);
+
+ if ((ns = get_gototab(pfa, s, c)) != 0)
+ s = ns;
+ else
+ s = cgoto(pfa, s, c);
+
+ if (pfa->out[s]) { /* final state */
+ patbeg = i;
+ patlen = j - i;
+ if (c == 0) /* don't count $ */
+ patlen--;
+ }
+
+ if (c && s != 1)
+ continue; /* origin i still viable, next j */
+ if (patlen)
+ break; /* best match found */
+
+ /* no match at origin i, next i and start over */
+ i += u8_rune(&c, (uschar *)i);
+ if (c == 0)
+ break; /* no match */
+ j = i;
s = 2;
- if (r.len > 1)
- i += r.len - 1; // i incremented around the loop
- } while (buf[i] && !patlen);
+ } while (1);
/* adjbuf() may have relocated a resized buffer. Inform the world. */
*pbuf = buf;
*pbufsize = bufsize;
if (patlen) {
- patbeg = (char *) buf + i;
/*
* Under no circumstances is the last character fed to
* the automaton part of the match. It is EOF's nullbyte,
@@ -893,11 +931,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
* terminate the buffer.
*/
do
- for (int ii = r.len; ii > 0; ii--)
- if (buf[--k] && ungetc(buf[k], f) == EOF)
- FATAL("unable to ungetc '%c'", buf[k]);
- while (k > i + patlen);
- buf[k] = '\0';
+ if (*--k && ungetc(*k, f) == EOF)
+ FATAL("unable to ungetc '%c'", *k);
+ while (k > patbeg + patlen);
+ *k = '\0';
return true;
}
else
@@ -1486,8 +1523,7 @@ int cgoto(fa *f, int s, int c)
/* add tmpset to current set of states */
++(f->curstat);
resize_state(f, f->curstat);
- for (i = 0; i < NCHARS; i++)
- set_gototab(f, f->curstat, 0, 0);
+ clear_gototab(f, f->curstat);
xfree(f->posns[f->curstat]);
p = intalloc(setcnt + 1, __func__);
@@ -1511,7 +1547,8 @@ void freefa(fa *f) /* free a finite automaton */
if (f == NULL)
return;
for (i = 0; i < f->state_count; i++)
- xfree(f->gototab[i])
+ xfree(f->gototab[i].entries);
+ xfree(f->gototab);
for (i = 0; i <= f->curstat; i++)
xfree(f->posns[i]);
for (i = 0; i <= f->accept; i++) {
diff --git a/bugs-fixed/REGRESS b/bugs-fixed/REGRESS
index 0716003..98d578a 100755
--- a/bugs-fixed/REGRESS
+++ b/bugs-fixed/REGRESS
@@ -1,4 +1,4 @@
-#! /bin/bash
+#! /bin/sh
if [ ! -f ../a.out ]
then
diff --git a/lex.c b/lex.c
index 675c116..0473a33 100644
--- a/lex.c
+++ b/lex.c
@@ -421,8 +421,12 @@ int string(void)
{
int i;
+ if (!isxdigit(peek())) {
+ unput(c);
+ break;
+ }
n = 0;
- for (i = 1; i <= 2; i++) {
+ for (i = 0; i < 2; i++) {
c = input();
if (c == 0)
break;
@@ -433,13 +437,13 @@ int string(void)
n += (c - '0');
else
n += 10 + (c - 'a');
- } else
+ } else {
+ unput(c);
break;
+ }
}
- if (n)
+ if (i)
*bp++ = n;
- else
- unput(c);
break;
}
diff --git a/main.c b/main.c
index 3a205c8..c478e32 100644
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
-const char *version = "version 20231030";
+const char *version = "version 20231124";
#define DEBUG
#include <stdio.h>
diff --git a/makefile b/makefile
index df966ef..b47a8af 100644
--- a/makefile
+++ b/makefile
@@ -28,10 +28,10 @@ CFLAGS =
CFLAGS = -O2
# compiler options
-#CC = gcc -Wall -g -Wwrite-strings
-#CC = gcc -O4 -Wall -pedantic -fno-strict-aliasing
-#CC = gcc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov
-HOSTCC = gcc -g -Wall -pedantic -Wcast-qual
+#CC = cc -Wall -g -Wwrite-strings
+#CC = cc -O4 -Wall -pedantic -fno-strict-aliasing
+#CC = cc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov
+HOSTCC = cc -g -Wall -pedantic -Wcast-qual
CC = $(HOSTCC) # change this if cross-compiling.
# By fiat, to make our lives easier, yacc is now defined to be bison.
diff --git a/maketab.c b/maketab.c
index 433541e..3747efa 100644
--- a/maketab.c
+++ b/maketab.c
@@ -52,8 +52,8 @@ struct xx
{ ARRAY, "array", NULL },
{ INDIRECT, "indirect", "$(" },
{ SUBSTR, "substr", "substr" },
- { SUB, "sub", "sub" },
- { GSUB, "gsub", "gsub" },
+ { SUB, "dosub", "sub" },
+ { GSUB, "dosub", "gsub" },
{ INDEX, "sindex", "sindex" },
{ SPRINTF, "awksprintf", "sprintf " },
{ ADD, "arith", " + " },
diff --git a/proto.h b/proto.h
index cb4988e..ed63e78 100644
--- a/proto.h
+++ b/proto.h
@@ -196,8 +196,7 @@ extern FILE *openfile(int, const char *, bool *);
extern const char *filename(FILE *);
extern Cell *closefile(Node **, int);
extern void closeall(void);
-extern Cell *sub(Node **, int);
-extern Cell *gsub(Node **, int);
+extern Cell *dosub(Node **, int);
extern FILE *popen(const char *, const char *);
extern int pclose(FILE *);
diff --git a/run.c b/run.c
index a9ef242..7462c38 100644
--- a/run.c
+++ b/run.c
@@ -1540,8 +1540,9 @@ Cell *assign(Node **a, int n) /* a[0] = a[1], a[0] += a[1], etc. */
if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
; /* self-assignment: leave alone unless it's a field or NF */
else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
+ yf = getfval(y);
setsval(x, getsval(y));
- x->fval = getfval(y);
+ x->fval = yf;
x->tval |= NUM;
}
else if (isstr(y))
@@ -2397,169 +2398,143 @@ static void flush_all(void)
void backsub(char **pb_ptr, const char **sptr_ptr);
-Cell *sub(Node **a, int nnn) /* substitute command */
+Cell *dosub(Node **a, int subop) /* sub and gsub */
{
- const char *sptr, *q;
- Cell *x, *y, *result;
- char *t, *buf, *pb;
fa *pfa;
+ int tempstat;
+ char *repl;
+ Cell *x;
+
+ char *buf = NULL;
+ char *pb = NULL;
int bufsz = recsize;
- if ((buf = (char *) malloc(bufsz)) == NULL)
- FATAL("out of memory in sub");
- x = execute(a[3]); /* target string */
- t = getsval(x);
- if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
- pfa = (fa *) a[1]; /* regular expression */
- else {
- y = execute(a[1]);
- pfa = makedfa(getsval(y), 1);
- tempfree(y);
+ const char *r, *s;
+ const char *start;
+ const char *noempty = NULL; /* empty match disallowed here */
+ size_t m = 0; /* match count */
+ size_t whichm; /* which match to select, 0 = global */
+ int mtype; /* match type */
+
+ if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */
+ pfa = (fa *) a[1];
+ } else {
+ x = execute(a[1]);
+ pfa = makedfa(getsval(x), 1);
+ tempfree(x);
}
- y = execute(a[2]); /* replacement string */
- result = False;
- if (pmatch(pfa, t)) {
- sptr = t;
- adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
- pb = buf;
- while (sptr < patbeg)
- *pb++ = *sptr++;
- sptr = getsval(y);
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
+
+ x = execute(a[2]); /* replacement string */
+ repl = tostring(getsval(x));
+ tempfree(x);
+
+ switch (subop) {
+ case SUB:
+ whichm = 1;
+ x = execute(a[3]); /* source string */
+ break;
+ case GSUB:
+ whichm = 0;
+ x = execute(a[3]); /* source string */
+ break;
+ default:
+ FATAL("dosub: unrecognized subop: %d", subop);
+ }
+
+ start = getsval(x);
+ while (pmatch(pfa, start)) {
+ if (buf == NULL) {
+ if ((pb = buf = malloc(bufsz)) == NULL)
+ FATAL("out of memory in dosub");
+ tempstat = pfa->initstat;
+ pfa->initstat = 2;
}
- *pb = '\0';
- if (pb > buf + bufsz)
- FATAL("sub result1 %.30s too big; can't happen", buf);
- sptr = patbeg + patlen;
- if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
- adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
- while ((*pb++ = *sptr++) != '\0')
- continue;
+
+ /* match types */
+ #define MT_IGNORE 0 /* unselected or invalid */
+ #define MT_INSERT 1 /* selected, empty */
+ #define MT_REPLACE 2 /* selected, not empty */
+
+ /* an empty match just after replacement is invalid */
+
+ if (patbeg == noempty && patlen == 0) {
+ mtype = MT_IGNORE; /* invalid, not counted */
+ } else if (whichm == ++m || whichm == 0) {
+ mtype = patlen ? MT_REPLACE : MT_INSERT;
+ } else {
+ mtype = MT_IGNORE; /* unselected, but counted */
}
- if (pb > buf + bufsz)
- FATAL("sub result2 %.30s too big; can't happen", buf);
- setsval(x, buf); /* BUG: should be able to avoid copy */
- result = True;
- }
- tempfree(x);
- tempfree(y);
- free(buf);
- return result;
-}
-Cell *gsub(Node **a, int nnn) /* global substitute */
-{
- Cell *x, *y;
- char *rptr, *pb;
- const char *q, *t, *sptr;
- char *buf;
- fa *pfa;
- int mflag, tempstat, num;
- int bufsz = recsize;
- int charlen = 0;
+ /* leading text: */
+ if (patbeg > start) {
+ adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
+ recsize, &pb, "dosub");
+ s = start;
+ while (s < patbeg)
+ *pb++ = *s++;
+ }
- if ((buf = (char *) malloc(bufsz)) == NULL)
- FATAL("out of memory in gsub");
- mflag = 0; /* if mflag == 0, can replace empty string */
- num = 0;
- x = execute(a[3]); /* target string */
- t = getsval(x);
- if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
- pfa = (fa *) a[1]; /* regular expression */
- else {
- y = execute(a[1]);
- pfa = makedfa(getsval(y), 1);
- tempfree(y);
- }
- y = execute(a[2]); /* replacement string */
- if (pmatch(pfa, t)) {
- tempstat = pfa->initstat;
- pfa->initstat = 2;
- pb = buf;
- rptr = getsval(y);
- do {
- if (patlen == 0 && *patbeg != '\0') { /* matched empty string */
- if (mflag == 0) { /* can replace empty */
- num++;
- sptr = rptr;
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
- }
- }
- if (*t == '\0') /* at end */
- goto done;
- adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
- charlen = u8_nextlen(t);
- while (charlen-- > 0)
- *pb++ = *t++;
- if (pb > buf + bufsz) /* BUG: not sure of this test */
- FATAL("gsub result0 %.30s too big; can't happen", buf);
- mflag = 0;
+ if (mtype == MT_IGNORE)
+ goto matching_text; /* skip replacement text */
+
+ r = repl;
+ while (*r != 0) {
+ adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
+ if (*r == '\\') {
+ backsub(&pb, &r);
+ } else if (*r == '&') {
+ r++;
+ adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
+ &pb, "dosub");
+ for (s = patbeg; s < patbeg+patlen; )
+ *pb++ = *s++;
+ } else {
+ *pb++ = *r++;
}
- else { /* matched nonempty string */
- num++;
- sptr = t;
- adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
- while (sptr < patbeg)
- *pb++ = *sptr++;
- sptr = rptr;
- while (*sptr != '\0') {
- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
- if (*sptr == '\\') {
- backsub(&pb, &sptr);
- } else if (*sptr == '&') {
- sptr++;
- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
- for (q = patbeg; q < patbeg+patlen; )
- *pb++ = *q++;
- } else
- *pb++ = *sptr++;
- }
- t = patbeg + patlen;
- if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
- goto done;
- if (pb > buf + bufsz)
- FATAL("gsub result1 %.30s too big; can't happen", buf);
- mflag = 1;
- }
- } while (pmatch(pfa,t));
- sptr = t;
- adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
- while ((*pb++ = *sptr++) != '\0')
- continue;
- done: if (pb < buf + bufsz)
- *pb = '\0';
- else if (*(pb-1) != '\0')
- FATAL("gsub result2 %.30s truncated; can't happen", buf);
- setsval(x, buf); /* BUG: should be able to avoid copy + free */
+ }
+
+matching_text:
+ if (mtype == MT_REPLACE || *patbeg == '\0')
+ goto next_search; /* skip matching text */
+
+ if (patlen == 0)
+ patlen = u8_nextlen(patbeg);
+ adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
+ s = patbeg;
+ while (s < patbeg + patlen)
+ *pb++ = *s++;
+
+next_search:
+ start = patbeg + patlen;
+ if (m == whichm || *patbeg == '\0')
+ break;
+ if (mtype == MT_REPLACE)
+ noempty = start;
+
+ #undef MT_IGNORE
+ #undef MT_INSERT
+ #undef MT_REPLACE
+ }
+
+ xfree(repl);
+
+ if (buf != NULL) {
pfa->initstat = tempstat;
+
+ /* trailing text */
+ adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
+ while ((*pb++ = *start++) != '\0')
+ ;
+
+ setsval(x, buf);
+ free(buf);
}
+
tempfree(x);
- tempfree(y);
x = gettemp();
x->tval = NUM;
- x->fval = num;
- free(buf);
- return(x);
+ x->fval = m;
+ return x;
}
void backsub(char **pb_ptr, const char **sptr_ptr) /* handle \\& variations */
diff --git a/testdir/Compare.tt b/testdir/Compare.tt
index ca828d2..4b297d7 100755
--- a/testdir/Compare.tt
+++ b/testdir/Compare.tt
@@ -4,7 +4,7 @@ oldawk=${oldawk-awk}
awk=${awk-../a.out}
echo compiling time.c
-gcc time.c -o time
+cc time.c -o time
time=./time
echo time command = $time
diff --git a/testdir/REGRESS b/testdir/REGRESS
index 5c3667f..b54ce3f 100755
--- a/testdir/REGRESS
+++ b/testdir/REGRESS
@@ -1,7 +1,7 @@
#!/bin/sh
uname -a
-gcc echo.c -o echo && echo echo compiled
+cc echo.c -o echo && echo echo compiled
oldawk=${oldawk-awk}
awk=${awk-../a.out}
diff --git a/testdir/T.csv b/testdir/T.csv
index 10da1ea..79c1510 100755
--- a/testdir/T.csv
+++ b/testdir/T.csv
@@ -77,5 +77,4 @@ a''b [a''b]
a, [a][]
"", [][]
, [][]
-a"b [a"b]
!!!!
diff --git a/testdir/T.flags b/testdir/T.flags
index 33d7c8d..17ce561 100755
--- a/testdir/T.flags
+++ b/testdir/T.flags
@@ -20,5 +20,6 @@ grep 'unknown option' foo >/dev/null || echo 'T.flags: bad unknown option'
$awk -F >foo 2>&1
grep 'no field separator' foo >/dev/null || echo 'T.flags: bad missing field separator'
-$awk -F '' >foo 2>&1
-grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator'
+### Awk is now like gawk and splits into separate characters if FS = ""
+# $awk -F '' >foo 2>&1
+# grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator'
diff --git a/testdir/T.misc b/testdir/T.misc
index 1e5c3c5..b8ed3c1 100755
--- a/testdir/T.misc
+++ b/testdir/T.misc
@@ -510,3 +510,17 @@ cmp -s foo1 foo2 || echo 'BAD: T.misc exit status on I/O error'
echo 1b >foo1
echo ab | $awk '{ sub(/a/, "b" ~ /b/); print }' >foo2
cmp -s foo1 foo2 || echo 'BAD: T.misc lexer regex buffer clobbered'
+
+# Check handling of octal \OOO and hex \xHH esc. seqs. in strings.
+echo 'hello888
+hello
+hello
+helloxGOO
+hello
+0A' > foo1
+$awk 'BEGIN { print "hello\888" }' > foo2
+$awk 'BEGIN { print "hello\x000A" }' >> foo2
+$awk 'BEGIN { printf "hello\x0A" }' >> foo2
+$awk 'BEGIN { print "hello\xGOO" }' >> foo2
+$awk 'BEGIN { print "hello\x0A0A" }' >> foo2
+cmp -s foo1 foo2 || echo 'BAD: T.misc escape sequences in strings mishandled'