summaryrefslogtreecommitdiff
path: root/glib/gutf8.c
diff options
context:
space:
mode:
authorMatthias Clasen <mclasen@redhat.com>2004-11-24 17:58:21 +0000
committerMatthias Clasen <matthiasc@src.gnome.org>2004-11-24 17:58:21 +0000
commit40fb4cff1019d266e4f7bdcac361d67406b54f45 (patch)
treeb0c32b50387fc95e69e8ffd912520507a20f58e6 /glib/gutf8.c
parentb8d9e050a4910a7e40d15371b3f860d68396448b (diff)
downloadglib-40fb4cff1019d266e4f7bdcac361d67406b54f45.tar.gz
Replace g_utf8_validate() with an optimized version, and clarify the docs
2004-11-24 Matthias Clasen <mclasen@redhat.com> * glib/gutf8.c: Replace g_utf8_validate() with an optimized version, and clarify the docs a bit. (#159131, Owen Taylor)
Diffstat (limited to 'glib/gutf8.c')
-rw-r--r--glib/gutf8.c219
1 files changed, 167 insertions, 52 deletions
diff --git a/glib/gutf8.c b/glib/gutf8.c
index ed830b184..a05a66aab 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -1511,19 +1511,171 @@ g_ucs4_to_utf16 (const gunichar *str,
return result;
}
+#define CONTINUATION_CHAR /* consume one continuation byte at p; the expanding scope must provide p, val and an error label */ \
+ G_STMT_START { \
+ if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
+ goto error; /* top two bits are not 10: not a continuation byte */ \
+ val <<= 6; /* make room for six more payload bits */ \
+ val |= (*(guchar *)p) & 0x3f; /* append the low six bits of this byte */ \
+ } G_STMT_END
+
+static const gchar *
+fast_validate (const char *str)
+ /* Scans str up to its terminating NUL. Returns a pointer to that NUL if every sequence passed the checks below, otherwise a pointer to the lead byte of the first sequence that failed. */
+{
+ gunichar val = 0; /* value assembled from a 3- or 4-byte sequence (unused on the 2-byte path) */
+ gunichar min = 0; /* smallest value accepted for the current sequence length */
+ const gchar *p;
+
+ for (p = str; *p; p++)
+ {
+ if (*(guchar *)p < 128)
+ /* done: single-byte character, nothing more to check */;
+ else
+ {
+ const gchar *last;
+
+ last = p; /* remember the lead byte; the error path returns it */
+ if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
+ {
+ if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) /* lead byte 0xc0 or 0xc1: value would be below 0x80 (overlong) */
+ goto error;
+ p++;
+ if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx; a NUL here also fails, so the scan never passes the terminator */
+ goto error;
+ }
+ else
+ {
+ if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
+ {
+ min = (1 << 11); /* values below 0x800 fit in two bytes */
+ val = *(guchar *)p & 0x0f; /* four payload bits from the lead byte */
+ goto TWO_REMAINING; /* skip the first of the three continuation steps below */
+ }
+ else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
+ {
+ min = (1 << 16); /* values below 0x10000 fit in three bytes */
+ val = *(guchar *)p & 0x07; /* three payload bits from the lead byte */
+ }
+ else
+ goto error; /* stray continuation byte (10xxxxxx) or 11111xxx lead byte */
+ /* 4-byte sequences fall through and consume three continuation bytes; 3-byte sequences enter at TWO_REMAINING and consume two */
+ p++;
+ CONTINUATION_CHAR;
+ TWO_REMAINING:
+ p++;
+ CONTINUATION_CHAR;
+ p++;
+ CONTINUATION_CHAR;
+ /* val below min means the value would fit in a shorter sequence (overlong encoding) */
+ if (G_UNLIKELY (val < min))
+ goto error;
+ /* UNICODE_VALID is defined outside this hunk; presumably rejects surrogates and out-of-range values -- TODO confirm */
+ if (G_UNLIKELY (!UNICODE_VALID(val)))
+ goto error;
+ }
+ /* sequence accepted: skip the error label; the loop's p++ then steps past the sequence's final byte */
+ continue;
+ /* reached only via goto */
+ error:
+ return last;
+ }
+ }
+ /* the loop ended on the terminating NUL */
+ return p;
+}
+
+static const gchar *
+fast_validate_len (const char *str,
+ gssize max_len)
+ /* Length-bounded scan: stops at str + max_len or at a NUL byte, whichever comes first (a negative max_len disables the bound). Returns the stopping position, or the lead byte of the first sequence that failed a check. */
+{
+ gunichar val = 0; /* value assembled from a 3- or 4-byte sequence (unused on the 2-byte path) */
+ gunichar min = 0; /* smallest value accepted for the current sequence length */
+ const gchar *p;
+
+ for (p = str; (max_len < 0 || (p - str) < max_len) && *p; p++)
+ {
+ if (*(guchar *)p < 128)
+ /* done: single-byte character, nothing more to check */;
+ else
+ {
+ const gchar *last;
+
+ last = p; /* remember the lead byte; the error path returns it */
+ if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
+ {
+ if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 2)) /* fewer than 2 bytes left within max_len: sequence is truncated */
+ goto error;
+
+ if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) /* lead byte 0xc0 or 0xc1: value would be below 0x80 (overlong) */
+ goto error;
+ p++;
+ if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
+ goto error;
+ }
+ else
+ {
+ if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
+ {
+ if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 3)) /* fewer than 3 bytes left within max_len */
+ goto error;
+
+ min = (1 << 11); /* values below 0x800 fit in two bytes */
+ val = *(guchar *)p & 0x0f; /* four payload bits from the lead byte */
+ goto TWO_REMAINING; /* skip the first of the three continuation steps below */
+ }
+ else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
+ {
+ if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 4)) /* fewer than 4 bytes left within max_len */
+ goto error;
+
+ min = (1 << 16); /* values below 0x10000 fit in three bytes */
+ val = *(guchar *)p & 0x07; /* three payload bits from the lead byte */
+ }
+ else
+ goto error; /* stray continuation byte (10xxxxxx) or 11111xxx lead byte */
+ /* the length checks above guarantee the continuation reads below stay within max_len */
+ p++;
+ CONTINUATION_CHAR;
+ TWO_REMAINING:
+ p++;
+ CONTINUATION_CHAR;
+ p++;
+ CONTINUATION_CHAR;
+ /* val below min means the value would fit in a shorter sequence (overlong encoding) */
+ if (G_UNLIKELY (val < min))
+ goto error;
+ if (G_UNLIKELY (!UNICODE_VALID(val))) /* UNICODE_VALID is defined outside this hunk; presumably rejects surrogates and out-of-range values -- TODO confirm */
+ goto error;
+ }
+ /* sequence accepted: skip the error label; the loop's p++ then steps past the sequence's final byte */
+ continue;
+ /* reached only via goto */
+ error:
+ return last;
+ }
+ }
+ /* the loop ended at str + max_len or on a NUL byte; the caller tells the two apart */
+ return p;
+}
+
/**
* g_utf8_validate:
* @str: a pointer to character data
- * @max_len: max bytes to validate, or -1 to go until nul
+ * @max_len: max bytes to validate, or -1 to go until NUL
* @end: return location for end of valid data
*
* Validates UTF-8 encoded text. @str is the text to validate;
* if @str is nul-terminated, then @max_len can be -1, otherwise
* @max_len should be the number of bytes to validate.
* If @end is non-%NULL, then the end of the valid range
- * will be stored there (i.e. the address of the first invalid byte
- * if some bytes were invalid, or the end of the text being validated
- * otherwise).
+ * will be stored there (i.e. the start of the first invalid
+ * character if some bytes were invalid, or the end of the text
+ * being validated otherwise).
+ *
+ * Note that g_utf8_validate() returns %FALSE if @max_len is
+ * positive and NUL is met before @max_len bytes have been read.
*
* Returns %TRUE if all of @str was valid. Many GLib and GTK+
* routines <emphasis>require</emphasis> valid UTF-8 as input;
@@ -1533,66 +1685,29 @@ g_ucs4_to_utf16 (const gunichar *str,
* Return value: %TRUE if the text was valid UTF-8
**/
gboolean
-g_utf8_validate (const gchar *str,
- gssize max_len,
- const gchar **end)
-{
+g_utf8_validate (const char *str,
+ gssize max_len,
+ const gchar **end)
+{ /* NOTE(review): the str != NULL precondition check removed below has no replacement among the added lines -- confirm this is intended */
const gchar *p;
- g_return_val_if_fail (str != NULL, FALSE);
-
- if (end)
- *end = str;
-
- p = str;
-
- while ((max_len < 0 || (p - str) < max_len) && *p)
- {
- int i, mask = 0, len;
- gunichar result;
- unsigned char c = (unsigned char) *p;
-
- UTF8_COMPUTE (c, mask, len);
-
- if (len == -1)
- break;
-
- /* check that the expected number of bytes exists in str */
- if (max_len >= 0 &&
- ((max_len - (p - str)) < len))
- break;
-
- UTF8_GET (result, p, i, mask, len);
-
- if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
- break;
-
- if (result == (gunichar)-1)
- break;
-
- if (!UNICODE_VALID (result))
- break;
-
- p += len;
- }
+ if (max_len < 0) /* negative max_len: scan until the terminating NUL */
+ p = fast_validate (str);
+ else /* otherwise scan at most max_len bytes */
+ p = fast_validate_len (str, max_len);
if (end)
*end = p;
- /* See that we covered the entire length if a length was
- * passed in, or that we ended on a nul if not
- */
- if (max_len >= 0 &&
- p != (str + max_len))
- return FALSE;
- else if (max_len < 0 &&
- *p != '\0')
+ if ((max_len >= 0 && p != str + max_len) || /* bounded: the scan must have reached exactly str + max_len (an earlier NUL counts as failure) */
+ (max_len < 0 && *p != '\0')) /* unbounded: the scan must have stopped on the terminating NUL */
return FALSE;
else
return TRUE;
}
+
/**
* g_unichar_validate:
* @ch: a Unicode character