AUDIT: r17063 - gnucash/trunk/src - Bug #396665: When any QIF file content is found that is not encoded in UTF-8,

Charles Day cedayiv at cvs.gnucash.org
Wed Apr 2 14:10:36 EDT 2008


Author: cedayiv
Date: 2008-04-02 14:10:35 -0400 (Wed, 02 Apr 2008)
New Revision: 17063
Trac: http://svn.gnucash.org/trac/changeset/17063

Modified:
   gnucash/trunk/src/core-utils/core-utils.i
   gnucash/trunk/src/core-utils/core-utils.scm
   gnucash/trunk/src/core-utils/gnc-glib-utils.c
   gnucash/trunk/src/core-utils/gnc-glib-utils.h
   gnucash/trunk/src/import-export/qif-import/qif-file.scm
Log:
Bug #396665: When any QIF file content is found that is not encoded in UTF-8,
the importer now first attempts to convert it to UTF-8 according to the locale.
If this fails, the offending bytes will be removed from the string as usual.
In addition, the user will now be informed of either of these actions via a
pop-up warning in the GUI. Each occurrence will also be logged.

This changeset also exposes the previously static GnuCash-specific UTF-8
validation C function, gnc_utf8_validate(), and creates a corresponding
Scheme predicate named "gnc-utf8?" for validating strings in this manner.
BP


Modified: gnucash/trunk/src/core-utils/core-utils.i
===================================================================
--- gnucash/trunk/src/core-utils/core-utils.i	2008-04-01 22:07:28 UTC (rev 17062)
+++ gnucash/trunk/src/core-utils/core-utils.i	2008-04-02 18:10:35 UTC (rev 17063)
@@ -23,3 +23,14 @@
 gchar * gnc_utf8_strip_invalid_strdup(const gchar *);
 %newobject gnc_locale_from_utf8;
 gchar * gnc_locale_from_utf8(const gchar *);
+%newobject gnc_locale_to_utf8;
+gchar * gnc_locale_to_utf8(const gchar *);
+%rename ("gnc-utf8?") wrap_gnc_utf8_validate;
+%inline %{
+  /* Predicate form of gnc_utf8_validate(): checks the whole nul-terminated string; end position not needed. */
+  gboolean wrap_gnc_utf8_validate(const gchar *);
+  gboolean wrap_gnc_utf8_validate(const gchar * str)
+  {
+    return gnc_utf8_validate(str, -1, NULL);
+  }
+%}

Modified: gnucash/trunk/src/core-utils/core-utils.scm
===================================================================
--- gnucash/trunk/src/core-utils/core-utils.scm	2008-04-01 22:07:28 UTC (rev 17062)
+++ gnucash/trunk/src/core-utils/core-utils.scm	2008-04-02 18:10:35 UTC (rev 17063)
@@ -11,8 +11,10 @@
 
 (re-export gnc-is-debugging)
 (re-export g-find-program-in-path)
+(re-export gnc-utf8?)
 (re-export gnc-utf8-strip-invalid-strdup)
 (re-export gnc-locale-from-utf8)
+(re-export gnc-locale-to-utf8)
 (re-export gnc-scm-log-warn)
 (re-export gnc-scm-log-error)
 (re-export gnc-scm-log-msg)

Modified: gnucash/trunk/src/core-utils/gnc-glib-utils.c
===================================================================
--- gnucash/trunk/src/core-utils/gnc-glib-utils.c	2008-04-01 22:07:28 UTC (rev 17062)
+++ gnucash/trunk/src/core-utils/gnc-glib-utils.c	2008-04-02 18:10:35 UTC (rev 17063)
@@ -119,35 +119,10 @@
      ((Char) >= 0x20 || (Char) == 0x09 || (Char) == 0x0A || (Char) == 0x0D) && \
      ((Char) & 0xFFFE) != 0xFFFE)
 
-/**
- * gnc_utf8_validate (copied from g_utf8_validate):
- * @str: a pointer to character data
- * @max_len: max bytes to validate, or -1 to go until nul
- * @end: return location for end of valid data
- * 
- * Validates UTF-8 encoded text. @str is the text to validate;
- * if @str is nul-terminated, then @max_len can be -1, otherwise
- * @max_len should be the number of bytes to validate.
- * If @end is non-%NULL, then the end of the valid range
- * will be stored there (i.e. the address of the first invalid byte
- * if some bytes were invalid, or the end of the text being validated
- * otherwise).
- *
- * This function looks validates the strict subset of UTF-8 that is
- * valid XML text, as detailed in
- * http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535
- *
- * Returns %TRUE if all of @str was valid. Many GLib and GTK+
- * routines <emphasis>require</emphasis> valid UTF-8 as input;
- * so data read from a file or the network should be checked
- * with g_utf8_validate() before doing anything else with it.
- * 
- * Return value: %TRUE if the text was valid UTF-8
- **/
-static gboolean
-gnc_utf8_validate (const gchar  *str,
-                 gssize        max_len,    
-                 const gchar **end)
+gboolean
+gnc_utf8_validate(const gchar  *str,
+                  gssize        max_len,    
+                  const gchar **end)
 {
 
   const gchar *p;
@@ -244,6 +219,21 @@
   return locale_str;
 }
 
+gchar *
+gnc_locale_to_utf8(const gchar* str)
+{
+  gchar *   utf8_str;
+  GError *  err = NULL;
+
+  /* Convert to UTF-8 from the current locale's encoding; the result (or NULL on error) goes to the caller. */
+  utf8_str = g_locale_to_utf8(str, -1, NULL, NULL, &err);
+  if (err)
+    g_warning("g_locale_to_utf8 failed: %s", err->message);
+  g_clear_error(&err);  /* We own any GError set above; free it (no-op when err is NULL). */
+
+  return utf8_str;
+}
+
 GList*
 gnc_g_list_map(GList* list, GncGMapFunc fn, gpointer user_data)
 {

Modified: gnucash/trunk/src/core-utils/gnc-glib-utils.h
===================================================================
--- gnucash/trunk/src/core-utils/gnc-glib-utils.h	2008-04-01 22:07:28 UTC (rev 17062)
+++ gnucash/trunk/src/core-utils/gnc-glib-utils.h	2008-04-02 18:10:35 UTC (rev 17063)
@@ -55,6 +55,32 @@
  *  compares after str2. */
 int safe_utf8_collate (const char *str1, const char *str2);
 
+/**
+ * gnc_utf8_validate (copied from g_utf8_validate):
+ * @str: a pointer to character data
+ * @max_len: max bytes to validate, or -1 to go until nul
+ * @end: return location for end of valid data
+ * 
+ * Validates UTF-8 encoded text. @str is the text to validate;
+ * if @str is nul-terminated, then @max_len can be -1, otherwise
+ * @max_len should be the number of bytes to validate.
+ * If @end is non-%NULL, then the end of the valid range
+ * will be stored there (i.e. the address of the first invalid byte
+ * if some bytes were invalid, or the end of the text being validated
+ * otherwise).
+ *
+ * This function validates only the strict subset of UTF-8 that is
+ * valid XML text, as detailed in
+ * http://www.w3.org/TR/REC-xml/#NT-Char linked from bug #346535
+ *
+ * Returns %TRUE if all of @str was valid. Many GLib and GTK+
+ * routines <emphasis>require</emphasis> valid UTF-8 as input;
+ * so data read from a file or the network should be checked
+ * with g_utf8_validate() before doing anything else with it.
+ * 
+ * Return value: %TRUE if the text was valid UTF-8
+ **/
+gboolean gnc_utf8_validate(const gchar *str, gssize max_len, const gchar **end);
 
 /** Strip any non-utf8 characters from a string.  This function
  *  rewrites the string "in place" instead of allocating and returning
@@ -91,9 +117,22 @@
  * @param str A pointer to a UTF-8 encoded string to be converted.
  *
  * @return A newly allocated string that has to be g_free'd by the
- * caller. */
+ * caller. If an error occurs, NULL is returned. */
 gchar *gnc_locale_from_utf8(const gchar* str);
 
+/** Converts a string to UTF-8 from the encoding used for strings
+ * in the current locale.
+ *
+ * This essentially is a wrapper for g_locale_to_utf8 that can
+ * be swigified for use with Scheme to avoid adding a dependency
+ * for guile-glib.
+ *
+ * @param str A pointer to a string encoded according to locale.
+ *
+ * @return A newly allocated string that has to be g_free'd by the
+ * caller. If an error occurs, NULL is returned. */
+gchar *gnc_locale_to_utf8(const gchar* str);
+
 typedef gpointer (*GncGMapFunc)(gpointer data, gpointer user_data);
 
 /**

Modified: gnucash/trunk/src/import-export/qif-import/qif-file.scm
===================================================================
--- gnucash/trunk/src/import-export/qif-import/qif-file.scm	2008-04-01 22:07:28 UTC (rev 17062)
+++ gnucash/trunk/src/import-export/qif-import/qif-file.scm	2008-04-02 18:10:35 UTC (rev 17063)
@@ -28,6 +28,9 @@
 ;;
 ;;  Suck in all the lines. Don't do any string interpretation,
 ;;  just store the fields "raw".
+;;
+;; FIXME: This function really should be able to return multiple
+;;        errors and warnings rather than a single one.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (define (qif-file:read-file self path ticker-map window)
@@ -87,10 +90,34 @@
                   (set! tag (string-ref line 0))
                   (set! value (substring line 1))
 
-                  ;; If the line doesn't conform to UTF-8, remove any invalid
-                  ;; characters. This could be smarter, perhaps by trying a
-                  ;; a default character set conversion based on the locale.
-                  (set! value (gnc-utf8-strip-invalid-strdup value))
+                  ;; If the line doesn't conform to UTF-8, try a default
+                  ;; character set conversion based on the locale. If that
+                  ;; fails, remove any invalid characters.
+                  (if (not (gnc-utf8? value))
+                      (let ((converted-value (gnc-locale-to-utf8 value)))
+                        (if (or (string=? converted-value "")
+                                (not (gnc-utf8? converted-value)))
+                            (begin
+                              (set! value (gnc-utf8-strip-invalid-strdup value))
+                              (set! return-val
+                                    (list #t (string-append
+                               (_ "This file is not encoded in UTF-8 or ASCII.")
+                               " "
+                               (_ "Some characters have been discarded."))))
+                              (gnc:warn "qif-file:read-file:"
+                                        " stripping invalid characters."
+                                        "\nAfter: [" value "]"))
+                            (begin
+                              (set! return-val
+                                    (list #t (string-append
+                               (_ "This file is not encoded in UTF-8 or ASCII.")
+                               " "
+                               (_ "Some characters have been converted according to your locale."))))
+                              (gnc:warn "qif-file:read-file:"
+                                        " converting characters by locale."
+                                        "\nBefore: [" value "]"
+                                        "\nAfter:  [" converted-value "]")
+                              (set! value converted-value)))))
 
                   (if (eq? tag #\!)
                       ;; The "!" tag has the highest precedence and is used



More information about the gnucash-changes mailing list