gnucash stable: [gnc-datetime] improve CSV date parser with ICU and boost

Christopher Lam clam at code.gnucash.org
Fri Sep 13 23:20:51 EDT 2024


Updated	 via  https://github.com/Gnucash/gnucash/commit/ab641b31 (commit)
	from  https://github.com/Gnucash/gnucash/commit/c7b55d4f (commit)



commit ab641b31f8a9c218419946930fdb651895604ddb
Author: Christopher Lam <christopher.lck at gmail.com>
Date:   Mon Sep 2 00:21:35 2024 +0800

    [gnc-datetime] improve CSV date parser with ICU and boost
    
    1. Add a "Locale" date format backed by ICU, which uses the current
       locale for date parsing. Depending on the locale, ICU's date parser
       may parse "3 May 2023" or "2024年9月13日" (LC_TIME=zh_TW.utf8), among others.
    
    2. Augment d-m-y, m-d-y and y-m-d with the boost UK, US and ISO parsers
       respectively. This allows CSV import of dates with months written as
       words, such as "30 Sep 2023", "May 4, 1978" or "2023-Dec-25". Note that
       the boost parsers cannot recognise 2-digit years, so "30 Sep 24" is invalid.

diff --git a/libgnucash/engine/gnc-datetime.cpp b/libgnucash/engine/gnc-datetime.cpp
index 308ec24e4e..2396d8721c 100644
--- a/libgnucash/engine/gnc-datetime.cpp
+++ b/libgnucash/engine/gnc-datetime.cpp
@@ -29,6 +29,11 @@
 #include <boost/date_time/local_time/local_time.hpp>
 #include <boost/locale.hpp>
 #include <boost/regex.hpp>
+#include <unicode/smpdtfmt.h>
+#include <unicode/locid.h>
+#include <unicode/udat.h>
+#include <unicode/parsepos.h>
+#include <unicode/calendar.h>
 #include <libintl.h>
 #include <locale.h>
 #include <map>
@@ -70,6 +75,8 @@ static const TZ_Ptr utc_zone(new boost::local_time::posix_time_zone("UTC-0"));
 void _set_tzp(TimeZoneProvider& tz);
 void _reset_tzp();
 
+static Date gregorian_date_from_locale_string (const std::string& str);
+
 /* To ensure things aren't overly screwed up by setting the nanosecond clock for boost::date_time. Don't do it, though, it doesn't get us anything and slows down the date/time library. */
 #ifndef BOOST_DATE_TIME_HAS_NANOSECONDS
 static constexpr auto ticks_per_second = INT64_C(1000000);
@@ -78,7 +85,7 @@ static constexpr auto ticks_per_second = INT64_C(1000000000);
 #endif
 
 /* Vector of date formats understood by gnucash and corresponding regex
- * to parse each from an external source
+ * and/or string->gregorian_date to parse each from an external source
  * Note: while the format names are using a "-" as separator, the
  * regexes will accept any of "-/.' " and will also work for dates
  * without separators.
@@ -86,6 +93,7 @@ static constexpr auto ticks_per_second = INT64_C(1000000000);
 const std::vector<GncDateFormat> GncDate::c_formats ({
     GncDateFormat {
         N_("y-m-d"),
+        boost::gregorian::from_string,
         "(?:"                                   // either y-m-d
         "(?<YEAR>[0-9]+)[-/.' ]+"
         "(?<MONTH>[0-9]+)[-/.' ]+"
@@ -98,6 +106,7 @@ const std::vector<GncDateFormat> GncDate::c_formats ({
     },
     GncDateFormat {
         N_("d-m-y"),
+        boost::gregorian::from_uk_string,
         "(?:"                                   // either d-m-y
         "(?<DAY>[0-9]+)[-/.' ]+"
         "(?<MONTH>[0-9]+)[-/.' ]+"
@@ -110,6 +119,7 @@ const std::vector<GncDateFormat> GncDate::c_formats ({
     },
     GncDateFormat {
         N_("m-d-y"),
+        boost::gregorian::from_us_string,
         "(?:"                                   // either m-d-y
         "(?<MONTH>[0-9]+)[-/.' ]+"
         "(?<DAY>[0-9]+)[-/.' ]+"
@@ -145,7 +155,8 @@ const std::vector<GncDateFormat> GncDate::c_formats ({
         "(?<DAY>[0-9]{2})"
         "(?<YEAR>[0-9]+)?"
         ")"
-    }
+    },
+    GncDateFormat { N_("Locale"), gregorian_date_from_locale_string },
 });
 
 /** Private implementation of GncDateTime. See the documentation for that class.
@@ -607,6 +618,65 @@ GncDateTimeImpl::timestamp()
     return str.substr(0, 8) + str.substr(9, 15);
 }
 
+struct ICUResources
+{
+    std::unique_ptr<icu::DateFormat> formatter;
+    std::unique_ptr<icu::Calendar> calendar;
+};
+
+static ICUResources&
+get_icu_resources()
+{
+    static ICUResources rv;
+
+    if (!rv.formatter)
+    {
+        icu::Locale locale;
+        if (auto lc_time_locale = setlocale (LC_TIME, nullptr))
+        {
+            std::string localeStr(lc_time_locale);
+            if (size_t dotPos = localeStr.find('.'); dotPos != std::string::npos)
+                localeStr = localeStr.substr(0, dotPos);
+
+            locale = icu::Locale::createCanonical (localeStr.c_str());
+        }
+
+        rv.formatter.reset(icu::DateFormat::createDateInstance(icu::DateFormat::kDefault, locale));
+        if (!rv.formatter)
+            throw std::invalid_argument("Cannot create date formatter.");
+
+        UErrorCode status = U_ZERO_ERROR;
+        rv.calendar.reset(icu::Calendar::createInstance(locale, status));
+        if (U_FAILURE(status))
+            throw std::invalid_argument("Cannot create calendar instance.");
+
+        rv.calendar->setLenient(false);
+    }
+
+    return rv;
+}
+
+static Date
+gregorian_date_from_locale_string (const std::string& str)
+{
+    ICUResources& resources = get_icu_resources();
+
+    icu::UnicodeString input = icu::UnicodeString::fromUTF8(str);
+    icu::ParsePosition parsePos;
+    UDate date = resources.formatter->parse(input, parsePos);
+    if (parsePos.getErrorIndex() != -1 || parsePos.getIndex() != input.length())
+        throw std::invalid_argument ("Cannot parse string");
+
+    UErrorCode status = U_ZERO_ERROR;
+    resources.calendar->setTime(date, status);
+    if (U_FAILURE(status))
+        throw std::invalid_argument ("Cannot set calendar time");
+
+    return Date (resources.calendar->get(UCAL_YEAR, status),
+                 resources.calendar->get(UCAL_MONTH, status) + 1,
+                 resources.calendar->get(UCAL_DATE, status));
+}
+
 /* Member function definitions for GncDateImpl.
  */
 GncDateImpl::GncDateImpl(const std::string str, const std::string fmt) :
@@ -617,6 +687,19 @@ GncDateImpl::GncDateImpl(const std::string str, const std::string fmt) :
     if (iter == GncDate::c_formats.cend())
         throw std::invalid_argument(N_("Unknown date format specifier passed as argument."));
 
+    if (iter->m_str_to_date)
+    {
+        try
+        {
+            m_greg = (*iter->m_str_to_date)(str);
+            return;
+        }
+        catch (...) {}          // with any string->date exception, try regex
+    }
+
+    if (iter->m_re.empty())
+        throw std::invalid_argument ("No regex pattern available");
+
     boost::regex r(iter->m_re);
     boost::smatch what;
     if(!boost::regex_search(str, what, r))  // regex didn't find a match
diff --git a/libgnucash/engine/gnc-datetime.hpp b/libgnucash/engine/gnc-datetime.hpp
index 77a6039b10..6c150a9222 100644
--- a/libgnucash/engine/gnc-datetime.hpp
+++ b/libgnucash/engine/gnc-datetime.hpp
@@ -29,6 +29,10 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include <functional>
+#include <optional>
+
+#include <boost/date_time/gregorian/gregorian.hpp>
 
 typedef struct
 {
@@ -172,6 +176,8 @@ private:
  * GncDate::c_formats class variable and work with those.
  */
 
+using StringToDate = std::function<boost::gregorian::date(const std::string&)>;
+
 class GncDateFormat
 {
 public:
@@ -182,6 +188,10 @@ public:
      */
     GncDateFormat (const char* fmt, const char* re) :
     m_fmt(fmt), m_re(re) {}
+    GncDateFormat (const char* fmt, StringToDate str_to_date, const char* re) :
+        m_fmt(fmt), m_re(re), m_str_to_date(str_to_date) {}
+    GncDateFormat (const char* fmt, StringToDate str_to_date) :
+        m_fmt(fmt), m_str_to_date(str_to_date) {}
     /** A string representing the format. */
     const std::string m_fmt;
 private:
@@ -189,6 +199,7 @@ private:
      * only be used internally by the gnc-datetime code.
      */
     const std::string m_re;
+    std::optional<StringToDate> m_str_to_date;
 
     friend class GncDateImpl;
 };
diff --git a/libgnucash/engine/test/gtest-gnc-datetime.cpp b/libgnucash/engine/test/gtest-gnc-datetime.cpp
index 70b8b1a614..b1d0efaf4c 100644
--- a/libgnucash/engine/test/gtest-gnc-datetime.cpp
+++ b/libgnucash/engine/test/gtest-gnc-datetime.cpp
@@ -89,6 +89,12 @@ TEST(gnc_date_constructors, test_str_format_constructor)
         { "y-m-d",  "1985.3.12", 1985,  3, 12},
         { "y-m-d",      "3'6'8", 2003,  6,  8},
         { "y-m-d",   "20130801", 2013,  8,  1},
+        { "y-m-d", "2013 Aug 1", 2013,  8,  1},
+        { "y-m-d", "2013 Aug 01",2013,  8,  1},
+        { "y-m-d", "2013 August 01",    2013,  8,  1},
+        { "y-m-d", "2013-August-1",     2013,  8,  1},
+        { "y-m-d", "2009/Nov/04",2009, 11,  4},
+        { "y-m-d","1985.Mar.12", 1985,  3, 12},
         { "d-m-y", "01-08-2013", 2013,  8,  1},
         { "d-m-y",  "01-8-2013", 2013,  8,  1},
         { "d-m-y",  "1-08-2013", 2013,  8,  1},
@@ -101,6 +107,9 @@ TEST(gnc_date_constructors, test_str_format_constructor)
         { "d-m-y",  "12.3.1985", 1985,  3, 12},
         { "d-m-y",      "8'6'3", 2003,  6,  8},
         { "d-m-y",   "01082013", 2013,  8,  1},
+        { "d-m-y", "1 Aug 2013", 2013,  8,  1},
+        { "d-m-y", "1 Sep 2013", 2013,  9,  1},
+        { "d-m-y", "1 September 2013",  2013,  9,  1},
         { "m-d-y", "08-01-2013", 2013,  8,  1},
         { "m-d-y",  "8-01-2013", 2013,  8,  1},
         { "m-d-y",  "08-1-2013", 2013,  8,  1},
@@ -113,6 +122,8 @@ TEST(gnc_date_constructors, test_str_format_constructor)
         { "m-d-y",  "3.12.1985", 1985,  3, 12},
         { "m-d-y",      "6'8'3", 2003,  6,  8},
         { "m-d-y",   "08012013", 2013,  8,  1},
+        { "m-d-y", "November 4, 2009",  2009, 11,  4},
+        { "m-d-y", "Nov 4, 2009",       2009, 11,  4},
         {   "d-m",      "01-08",   curr_year,  8,  1},
         {   "d-m",       "01-8",   curr_year,  8,  1},
         {   "d-m",       "1-08",   curr_year,  8,  1},
@@ -130,6 +141,29 @@ TEST(gnc_date_constructors, test_str_format_constructor)
         {   "m-d",        "6'8",   curr_year,  6,  8},
         {   "m-d",       "0801",   curr_year,  8,  1},
 
+        // invalid dates
+        { "d-m-y", "0 Aug 2013",          -1, -1, -1},
+        { "d-m-y", "31 Sep 2013",         -1, -1, -1},
+        { "d-m-y", "31 September 2013",   -1, -1, -1},
+        { "d-m-y", "31/11/2009",          -2, -2, -2},
+        { "d-m-y",  "34.3.1985",          -2, -2, -2},
+        { "m-d-y", "November 41, 2009",   -1, -1, -1},
+        { "m-d-y", "Nov 31, 2009",        -1, -1, -1},
+        { "y-m-d", "2013 Aug 0",          -1, -1, -1},
+        { "y-m-d", "2013 Feb 30",         -1, -1, -1},
+        { "y-m-d", "2013 August 0",       -1, -1, -1},
+        { "y-m-d", "2013-June-31",        -1, -1, -1},
+        { "y-m-d", "2009/Nov/0",          -1, -1, -1},
+        { "y-m-d",  "1985.Mar.32",        -1, -1, -1},
+
+        // 2-digit dates are not parsable with months as words
+        { "d-m-y", "1 Sep 13",            -1, -1, -1},
+        { "d-m-y", "1 September 13",      -1, -1, -1},
+        { "m-d-y", "November 4, 24",      -1, -1, -1},
+        { "m-d-y", "Nov 4, 23",           -1, -1, -1},
+        { "m-d-y", "Nov 29, 24",          -1, -1, -1},
+        { "y-m-d", "13-June-11",          -1, -1, -1},
+
         // ambiguous date formats
         // current parser doesn't know how to disambiguate
         // and hence refuses to parse
@@ -186,6 +220,10 @@ TEST(gnc_date_constructors, test_str_format_constructor)
         {
             got_year = got_month = got_day = -1;
         }
+        catch (const std::out_of_range&)
+        {
+            got_year = got_month = got_day = -2;
+        }
 
         EXPECT_TRUE ((got_year  == test_dates[i].exp_year) &&
                      (got_month == test_dates[i].exp_month) &&



Summary of changes:
 libgnucash/engine/gnc-datetime.cpp            | 87 ++++++++++++++++++++++++++-
 libgnucash/engine/gnc-datetime.hpp            | 11 ++++
 libgnucash/engine/test/gtest-gnc-datetime.cpp | 38 ++++++++++++
 3 files changed, 134 insertions(+), 2 deletions(-)



More information about the gnucash-changes mailing list