[Gnucash-changes] r14466 - gnucash/trunk - Consider certain control characters as invalid UTF-8. Fixes #346535.
Derek Atkins
warlord at cvs.gnucash.org
Wed Jul 5 12:04:38 EDT 2006
Author: warlord
Date: 2006-07-05 12:04:36 -0400 (Wed, 05 Jul 2006)
New Revision: 14466
Trac: http://svn.gnucash.org/trac/changeset/14466
Modified:
gnucash/trunk/ChangeLog
gnucash/trunk/src/core-utils/gnc-glib-utils.c
gnucash/trunk/src/core-utils/gnc-glib-utils.h
gnucash/trunk/src/core-utils/gw-core-utils-spec.scm
Log:
Consider certain control characters as invalid UTF-8. Fixes #346535.
* src/core-utils/gnc-glib-utils.h
* src/core-utils/gw-core-utils-spec.scm:
Remove the gnc_utf8_validate() API. It's not used anywhere.
* src/core-utils/gnc-glib-utils.c:
Rework gnc_utf8_validate() as a copy-and-paste of g_utf8_validate
but reject certain control characters between 0x00 and 0x20
that are not valid XML characters. Fixes #346535.
Modified: gnucash/trunk/ChangeLog
===================================================================
--- gnucash/trunk/ChangeLog 2006-07-04 20:35:52 UTC (rev 14465)
+++ gnucash/trunk/ChangeLog 2006-07-05 16:04:36 UTC (rev 14466)
@@ -1,3 +1,13 @@
+2006-07-04 Derek Atkins <derek at ihtfp.com>
+
+ * src/core-utils/gnc-glib-utils.h
+ * src/core-utils/gw-core-utils-spec.scm:
+ Remove the gnc_utf8_validate() API. It's not used anywhere.
+ * src/core-utils/gnc-glib-utils.c:
+ Rework gnc_utf8_validate() as a copy-and-paste of g_utf8_validate
+ but reject certain control characters between 0x00 and 0x20
+ that are not valid XML characters. Fixes #346535.
+
2006-07-04 Christian Stimming <stimming at tuhh.de>
* po/de.po: Proof-read and improved German translation by feedback
Modified: gnucash/trunk/src/core-utils/gnc-glib-utils.c
===================================================================
--- gnucash/trunk/src/core-utils/gnc-glib-utils.c 2006-07-04 20:35:52 UTC (rev 14465)
+++ gnucash/trunk/src/core-utils/gnc-glib-utils.c 2006-07-05 16:04:36 UTC (rev 14466)
@@ -44,10 +44,159 @@
return 0;
}
-gboolean
-gnc_utf8_validate (const gchar *str)
+/********************************************************************
+ * The following definitions are from gutf8.c, for use by
+ * gnc_utf8_validate(). These are all verbatim copies, except for
+ * UNICODE_VALID() which has been modified to look for the strict
+ * subset of UTF-8 that is valid XML text.
+ */
+
+#define UTF8_COMPUTE(Char, Mask, Len) \
+ if (Char < 128) \
+ { \
+ Len = 1; \
+ Mask = 0x7f; \
+ } \
+ else if ((Char & 0xe0) == 0xc0) \
+ { \
+ Len = 2; \
+ Mask = 0x1f; \
+ } \
+ else if ((Char & 0xf0) == 0xe0) \
+ { \
+ Len = 3; \
+ Mask = 0x0f; \
+ } \
+ else if ((Char & 0xf8) == 0xf0) \
+ { \
+ Len = 4; \
+ Mask = 0x07; \
+ } \
+ else if ((Char & 0xfc) == 0xf8) \
+ { \
+ Len = 5; \
+ Mask = 0x03; \
+ } \
+ else if ((Char & 0xfe) == 0xfc) \
+ { \
+ Len = 6; \
+ Mask = 0x01; \
+ } \
+ else \
+ Len = -1;
+
+#define UTF8_LENGTH(Char) \
+ ((Char) < 0x80 ? 1 : \
+ ((Char) < 0x800 ? 2 : \
+ ((Char) < 0x10000 ? 3 : \
+ ((Char) < 0x200000 ? 4 : \
+ ((Char) < 0x4000000 ? 5 : 6)))))
+
+
+#define UTF8_GET(Result, Chars, Count, Mask, Len) \
+ (Result) = (Chars)[0] & (Mask); \
+ for ((Count) = 1; (Count) < (Len); ++(Count)) \
+ { \
+ if (((Chars)[(Count)] & 0xc0) != 0x80) \
+ { \
+ (Result) = -1; \
+ break; \
+ } \
+ (Result) <<= 6; \
+ (Result) |= ((Chars)[(Count)] & 0x3f); \
+ }
+
+#define UNICODE_VALID(Char) \
+ ((Char) < 0x110000 && \
+ (((Char) & 0xFFFFF800) != 0xD800) && \
+ ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
+ ((Char) > 0x20 || (Char) == 0x09 || (Char) == 0x0A || (Char) == 0x0D) && \
+ ((Char) & 0xFFFE) != 0xFFFE)
+
+/**
+ * gnc_utf8_validate (copied from g_utf8_validate):
+ * @str: a pointer to character data
+ * @max_len: max bytes to validate, or -1 to go until nul
+ * @end: return location for end of valid data
+ *
+ * Validates UTF-8 encoded text. @str is the text to validate;
+ * if @str is nul-terminated, then @max_len can be -1, otherwise
+ * @max_len should be the number of bytes to validate.
+ * If @end is non-%NULL, then the end of the valid range
+ * will be stored there (i.e. the address of the first invalid byte
+ * if some bytes were invalid, or the end of the text being validated
+ * otherwise).
+ *
+ * This function validates only the strict subset of UTF-8 that is
+ * valid XML text, as detailed in
+ * http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535
+ *
+ * Returns %TRUE if all of @str was valid. Many GLib and GTK+
+ * routines <emphasis>require</emphasis> valid UTF-8 as input;
+ * so data read from a file or the network should be checked
+ * with g_utf8_validate() before doing anything else with it.
+ *
+ * Return value: %TRUE if the text was valid UTF-8
+ **/
+static gboolean
+gnc_utf8_validate (const gchar *str,
+ gssize max_len,
+ const gchar **end)
{
- return g_utf8_validate(str, -1, NULL);
+
+ const gchar *p;
+
+ g_return_val_if_fail (str != NULL, FALSE);
+
+ if (end)
+ *end = str;
+
+ p = str;
+
+ while ((max_len < 0 || (p - str) < max_len) && *p)
+ {
+ int i, mask = 0, len;
+ gunichar result;
+ unsigned char c = (unsigned char) *p;
+
+ UTF8_COMPUTE (c, mask, len);
+
+ if (len == -1)
+ break;
+
+ /* check that the expected number of bytes exists in str */
+ if (max_len >= 0 &&
+ ((max_len - (p - str)) < len))
+ break;
+
+ UTF8_GET (result, p, i, mask, len);
+
+ if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
+ break;
+
+ if (result == (gunichar)-1)
+ break;
+
+ if (!UNICODE_VALID (result))
+ break;
+
+ p += len;
+ }
+
+ if (end)
+ *end = p;
+
+ /* See that we covered the entire length if a length was
+ * passed in, or that we ended on a nul if not
+ */
+ if (max_len >= 0 &&
+ p != (str + max_len))
+ return FALSE;
+ else if (max_len < 0 &&
+ *p != '\0')
+ return FALSE;
+ else
+ return TRUE;
}
void
@@ -56,12 +205,12 @@
gchar *end;
gint len;
- if (g_utf8_validate(str, -1, (const gchar **)&end))
+ if (gnc_utf8_validate(str, -1, (const gchar **)&end))
return;
g_warning("Invalid utf8 string: %s", str);
do {
len = strlen(end);
memmove(end, end+1, len); /* shuffle the remainder one byte */
- } while (!g_utf8_validate(str, -1, (const gchar **)&end));
+ } while (!gnc_utf8_validate(str, -1, (const gchar **)&end));
}
Modified: gnucash/trunk/src/core-utils/gnc-glib-utils.h
===================================================================
--- gnucash/trunk/src/core-utils/gnc-glib-utils.h 2006-07-04 20:35:52 UTC (rev 14465)
+++ gnucash/trunk/src/core-utils/gnc-glib-utils.h 2006-07-05 16:04:36 UTC (rev 14466)
@@ -56,19 +56,12 @@
int safe_utf8_collate (const char *str1, const char *str2);
-/** This is a helper function for guile. C code should call
- * g_utf8_validate directly.
- *
- * @param str The string to be validated.
- *
- * @return TRUE if this string is valid utf8. */
-gboolean gnc_utf8_validate (const gchar *str);
-
-
/** Strip any non-utf8 characters from a string. This function
* rewrites the string "in place" instead of allocating and returning
- * a new string. This allows it to operat on strings that are
- * defined as character arrays in a larger data structure.
+ * a new string. This allows it to operate on strings that are
+ * defined as character arrays in a larger data structure. Note that
+ * it also removes certain control characters that are not valid XML.
+ * See http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535
*
* @param str A pointer to the string to strip of invalid
* characters. */
Modified: gnucash/trunk/src/core-utils/gw-core-utils-spec.scm
===================================================================
--- gnucash/trunk/src/core-utils/gw-core-utils-spec.scm 2006-07-04 20:35:52 UTC (rev 14465)
+++ gnucash/trunk/src/core-utils/gw-core-utils-spec.scm 2006-07-05 16:04:36 UTC (rev 14466)
@@ -55,14 +55,6 @@
(gw:wrap-function
ws
- 'gnc:utf8-validate
- '<gw:bool>
- "gnc_utf8_validate"
- '(((<gw:mchars> caller-owned) program))
- "Validate UTF8 encoded text.")
-
- (gw:wrap-function
- ws
'gnc:utf8-strip-invalid
'<gw:void>
"gnc_utf8_strip_invalid"
More information about the gnucash-changes
mailing list