Fix incorrect matching of 0xffffffff to any character with more than one other case in caseless 32-bit UCP (but not UTF) mode.

author: Philip Hazel <Philip.Hazel@gmail.com> 2023-12-04 16:11:41 +0000
committer: Philip Hazel <Philip.Hazel@gmail.com> 2023-12-04 16:11:41 +0000
commit: ad73148dfb6d06280a4d87f322991762aff90a55 (patch)
tree: 48245ad586fadb6581ba3556629b665c9bb4367f
parent: 014c82d7bcc2873cdb1f3abc5e5348587f477ba4 (diff)
download: pcre-ad73148dfb6d06280a4d87f322991762aff90a55.tar.gz
10 files changed, 302 insertions, 6 deletions
diff --git a/ChangeLog b/ChangeLog
index 9dc54cfb..3fbb4a92 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -183,6 +183,10 @@ error.
 PCRE2_CASELESS and PCRE2_UCP (but not PCRE2_UTF) were set. Fixed by not trying
 to look for other cases for characters above the Unicode range.
 
+50. In caseless 32-bit mode with UCP (but not UTF) set, the character
+0xffffffff incorrectly matched any character that has more than one other case,
+in particular k and s.
+
 
 Version 10.42 11-December-2022
 ------------------------------
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 1c48ad67..caae6524 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -1241,6 +1241,13 @@ for (;;)
           break;
 
           case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+          if (c > MAX_UTF_CODE_POINT)
+            {
+            OK = FALSE;
+            break;
+            }
+#endif
           cp = PRIV(ucd_caseless_sets) + code[2];
           for (;;)
             {
@@ -1516,6 +1523,13 @@ for (;;)
           break;
 
           case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+          if (c > MAX_UTF_CODE_POINT)
+            {
+            OK = FALSE;
+            break;
+            }
+#endif
           cp = PRIV(ucd_caseless_sets) + code[3];
           for (;;)
             {
@@ -1774,6 +1788,13 @@ for (;;)
           break;
 
           case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+          if (c > MAX_UTF_CODE_POINT)
+            {
+            OK = FALSE;
+            break;
+            }
+#endif
           cp = PRIV(ucd_caseless_sets) + code[3];
           for (;;)
             {
@@ -2058,6 +2079,13 @@ for (;;)
           break;
 
           case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+          if (c > MAX_UTF_CODE_POINT)
+            {
+            OK = FALSE;
+            break;
+            }
+#endif
           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
           for (;;)
             {
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index d162e707..b2e1f23b 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -2565,6 +2565,13 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
         break;
 
         case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+            if (fc > MAX_UTF_CODE_POINT)
+              {
+              if (notmatch) break;
+              RRETURN(MATCH_NOMATCH);
+              }
+#endif
         cp = PRIV(ucd_caseless_sets) + Fecode[2];
         for (;;)
           {
@@ -2885,6 +2892,13 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
               RRETURN(MATCH_NOMATCH);
               }
             GETCHARINCTEST(fc, Feptr);
+#if PCRE2_CODE_UNIT_WIDTH == 32
+            if (fc > MAX_UTF_CODE_POINT)
+              {
+              if (notmatch) continue;
+              RRETURN(MATCH_NOMATCH);
+              }
+#endif
             cp = PRIV(ucd_caseless_sets) + Lpropvalue;
             for (;;)
               {
@@ -3698,6 +3712,13 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
               RRETURN(MATCH_NOMATCH);
               }
             GETCHARINCTEST(fc, Feptr);
+#if PCRE2_CODE_UNIT_WIDTH == 32
+            if (fc > MAX_UTF_CODE_POINT)
+              {
+              if (Lctype == OP_NOTPROP) continue;
+              RRETURN(MATCH_NOMATCH);
+              }
+#endif
             cp = PRIV(ucd_caseless_sets) + Lpropvalue;
             for (;;)
               {
@@ -4278,14 +4299,24 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
               break;
               }
             GETCHARLENTEST(fc, Feptr, len);
-            cp = PRIV(ucd_caseless_sets) + Lpropvalue;
-            for (;;)
+#if PCRE2_CODE_UNIT_WIDTH == 32
+            if (fc > MAX_UTF_CODE_POINT)
               {
-              if (fc < *cp)
-                { if (notmatch) break; else goto GOT_MAX; }
-              if (fc == *cp++)
-                { if (notmatch) goto GOT_MAX; else break; }
+              if (!notmatch) goto GOT_MAX;
               }
+            else
+#endif
+              {
+              cp = PRIV(ucd_caseless_sets) + Lpropvalue;
+              for (;;)
+                {
+                if (fc < *cp)
+                  { if (notmatch) break; else goto GOT_MAX; }
+                if (fc == *cp++)
+                  { if (notmatch) goto GOT_MAX; else break; }
+                }
+              }
+
             Feptr += len;
             }
           GOT_MAX:
diff --git a/testdata/testinput12 b/testdata/testinput12
index de3d4067..85550c3b 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -576,5 +576,31 @@
 # This used to loop in 32-bit mode; it will fail in 16-bit mode.
 /[\x{ffffffff}]/caseless,ucp
     \x{ffffffff}xyz
+    
+# These are 32-bit tests for handing 0xffffffff when in UCP caselsss mode. They
+# will give errors in 16-bit mode.
+
+/k*\x{ffffffff}/caseless,ucp
+    \x{ffffffff}
+
+/k+\x{ffffffff}/caseless,ucp,no_start_optimize
+    K\x{ffffffff}
+\= Expect no match     
+    \x{ffffffff}\x{ffffffff}
+
+/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k\x{ffffffff}/caseless,ucp,no_start_optimize
+    K\x{ffffffff}
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
+\= Expect no match
+    Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
+
+# --------------------------------------------------------- 
 
 # End of testinput12
diff --git a/testdata/testinput14 b/testdata/testinput14
index 8a17ae73..8880b5ca 100644
--- a/testdata/testinput14
+++ b/testdata/testinput14
@@ -78,4 +78,31 @@
 
 # ---------------------------------------------------- 
 
+# ---------------------------------------------------- 
+# Tests for handling 0xffffffff in caseless UCP mode. They only apply to 32-bit
+# mode; for the other widths they will fail.
+
+/k*\x{ffffffff}/caseless,ucp
+    \x{ffffffff}
+
+/k+\x{ffffffff}/caseless,ucp,no_start_optimize
+    K\x{ffffffff}
+\= Expect no match     
+    \x{ffffffff}\x{ffffffff}
+
+/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k\x{ffffffff}/caseless,ucp,no_start_optimize
+    K\x{ffffffff}
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
+\= Expect no match
+    Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
+
+# ---------------------------------------------------- 
+
 # End of testinput14
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index 9fa93fa1..616d6930 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -1827,5 +1827,42 @@ Failed: error 134 at offset 11: character code point value in \x{} or \o{} is to
 /[\x{ffffffff}]/caseless,ucp
 Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large
     \x{ffffffff}xyz
+    
+# These are 32-bit tests for handing 0xffffffff when in UCP caselsss mode. They
+# will give errors in 16-bit mode.
+
+/k*\x{ffffffff}/caseless,ucp
+Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
+    \x{ffffffff}
+
+/k+\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
+    K\x{ffffffff}
+\= Expect no match     
+    \x{ffffffff}\x{ffffffff}
+
+/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 15: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large
+    K\x{ffffffff}
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
+\= Expect no match
+    Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+No match
+
+# --------------------------------------------------------- 
 
 # End of testinput12
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index 721d8bce..3c9586e4 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -1821,5 +1821,38 @@ No match
 /[\x{ffffffff}]/caseless,ucp
     \x{ffffffff}xyz
  0: \x{ffffffff}
+    
+# These are 32-bit tests for handing 0xffffffff when in UCP caselsss mode. They
+# will give errors in 16-bit mode.
+
+/k*\x{ffffffff}/caseless,ucp
+    \x{ffffffff}
+ 0: \x{ffffffff}
+
+/k+\x{ffffffff}/caseless,ucp,no_start_optimize
+    K\x{ffffffff}
+ 0: K\x{ffffffff}
+\= Expect no match     
+    \x{ffffffff}\x{ffffffff}
+No match
+
+/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+No match
+
+/k\x{ffffffff}/caseless,ucp,no_start_optimize
+    K\x{ffffffff}
+ 0: K\x{ffffffff}
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+No match
+
+/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
+\= Expect no match
+    Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
+No match
+
+# --------------------------------------------------------- 
 
 # End of testinput12
diff --git a/testdata/testoutput14-16 b/testdata/testoutput14-16
index 61541f61..dd1a977a 100644
--- a/testdata/testoutput14-16
+++ b/testdata/testoutput14-16
@@ -122,4 +122,42 @@ No match
 
 # ---------------------------------------------------- 
 
+# ---------------------------------------------------- 
+# Tests for handling 0xffffffff in caseless UCP mode. They only apply to 32-bit
+# mode; for the other widths they will fail.
+
+/k*\x{ffffffff}/caseless,ucp
+Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
+    \x{ffffffff}
+
+/k+\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
+    K\x{ffffffff}
+\= Expect no match     
+    \x{ffffffff}\x{ffffffff}
+
+/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 15: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large
+    K\x{ffffffff}
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
+\= Expect no match
+    Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+No match
+
+# ---------------------------------------------------- 
+
 # End of testinput14
diff --git a/testdata/testoutput14-32 b/testdata/testoutput14-32
index f1f65b74..dc21569c 100644
--- a/testdata/testoutput14-32
+++ b/testdata/testoutput14-32
@@ -122,4 +122,38 @@ No match
 
 # ---------------------------------------------------- 
 
+# ---------------------------------------------------- 
+# Tests for handling 0xffffffff in caseless UCP mode. They only apply to 32-bit
+# mode; for the other widths they will fail.
+
+/k*\x{ffffffff}/caseless,ucp
+    \x{ffffffff}
+ 0: \x{ffffffff}
+
+/k+\x{ffffffff}/caseless,ucp,no_start_optimize
+    K\x{ffffffff}
+ 0: K\x{ffffffff}
+\= Expect no match     
+    \x{ffffffff}\x{ffffffff}
+No match
+
+/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+No match
+
+/k\x{ffffffff}/caseless,ucp,no_start_optimize
+    K\x{ffffffff}
+ 0: K\x{ffffffff}
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+No match
+
+/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
+\= Expect no match
+    Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
+No match
+
+# ---------------------------------------------------- 
+
 # End of testinput14
diff --git a/testdata/testoutput14-8 b/testdata/testoutput14-8
index aa624141..69285db2 100644
--- a/testdata/testoutput14-8
+++ b/testdata/testoutput14-8
@@ -122,4 +122,42 @@ Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too
 
 # ---------------------------------------------------- 
 
+# ---------------------------------------------------- 
+# Tests for handling 0xffffffff in caseless UCP mode. They only apply to 32-bit
+# mode; for the other widths they will fail.
+
+/k*\x{ffffffff}/caseless,ucp
+Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
+    \x{ffffffff}
+
+/k+\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
+    K\x{ffffffff}
+\= Expect no match     
+    \x{ffffffff}\x{ffffffff}
+
+/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 15: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large
+    K\x{ffffffff}
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
+\= Expect no match
+    Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
+** Character \x{ffffffff} is greater than 255 and UTF-8 mode is not enabled.
+** Truncation will probably give the wrong result.
+** Character \x{ffffffff} is greater than 255 and UTF-8 mode is not enabled.
+** Truncation will probably give the wrong result.
+** Character \x{ffffffff} is greater than 255 and UTF-8 mode is not enabled.
+** Truncation will probably give the wrong result.
+No match
+
+# ---------------------------------------------------- 
+
 # End of testinput14
author	Philip Hazel <Philip.Hazel@gmail.com>	2023-12-04 16:11:41 +0000
committer	Philip Hazel <Philip.Hazel@gmail.com>	2023-12-04 16:11:41 +0000
commit	ad73148dfb6d06280a4d87f322991762aff90a55 (patch)
tree	48245ad586fadb6581ba3556629b665c9bb4367f
parent	014c82d7bcc2873cdb1f3abc5e5348587f477ba4 (diff)
download	pcre-ad73148dfb6d06280a4d87f322991762aff90a55.tar.gz