GnuCash  5.6-150-g038405b370+
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
import-parse.cpp
1 /*
2  * import-parse.cpp -- a generic "parser" API for importers. Allows importers
3  * to parse dates and numbers, and provides a UI to ask users to
4  * resolve ambiguities.
5  *
6  * Created by: Derek Atkins <derek@ihtfp.com>
7  * Copyright (c) 2003 Derek Atkins <warlord@MIT.EDU>
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of
12  * the License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, contact:
21  *
22  * Free Software Foundation Voice: +1-617-542-5942
23  * 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652
24  * Boston, MA 02110-1301, USA gnu@gnu.org
25  */
26 
27 #ifdef HAVE_CONFIG_H
28 #include <config.h>
29 #endif
30 
31 #include <glib.h>
32 #include <string.h>
33 
34 /* For regex */
35 #include <sys/types.h>
36 #include <regex.h>
37 
38 #include "gnc-engine.h"
39 #include "gnc-ui-util.h"
40 
41 #include "import-parse.h"
42 
43 static QofLogModule log_module = GNC_MOD_IMPORT;
44 
45 /* numeric regular expressions */
46 static regex_t decimal_radix_regex;
47 static regex_t comma_radix_regex;
48 
49 /* date regular expressions */
50 static regex_t date_regex;
51 static regex_t date_mdy_regex;
52 static regex_t date_ymd_regex;
53 
54 static gboolean regex_compiled = FALSE;
55 
56 /* Set and clear flags in bit-flags */
57 #define import_set_flag(i,f) (i = static_cast<GncImportFormat>(static_cast<int>(i) | static_cast<int>(f)))
58 #define import_clear_flag(i,f) (i = static_cast<GncImportFormat>(static_cast<int>(i) & static_cast<int>(~f)))
59 
60 static void
61 compile_regex(void)
62 {
63  int flags = REG_EXTENDED;
64 
65  /* compile the numeric regular expressions */
66  regcomp(&decimal_radix_regex,
67  "^ *\\$?[+-]?\\$?[0-9]+ *$|^ *\\$?[+-]?\\$?[0-9]?[0-9]?[0-9]?(,[0-9][0-9][0-9])*(\\.[0-9]*)? *$|^ *\\$?[+-]?\\$?[0-9]+\\.[0-9]* *$", flags);
68  regcomp(&comma_radix_regex,
69  "^ *\\$?[+-]?\\$?[0-9]+ *$|^ *\\$?[+-]?\\$?[0-9]?[0-9]?[0-9]?(\\.[0-9][0-9][0-9])*(,[0-9]*)? *$|^ *\\$?[+-]?\\$?[0-9]+,[0-9]* *$", flags);
70 
71  /* compile the date-parsing regular expressions */
72  regcomp(&date_regex,
73  "^ *([0-9]+) *[-/.'] *([0-9]+) *[-/.'] *([0-9]+).*$|^ *([0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]).*$", flags);
74  regcomp(&date_mdy_regex, "([0-9][0-9])([0-9][0-9])([0-9][0-9][0-9][0-9])", flags);
75  regcomp(&date_ymd_regex, "([0-9][0-9][0-9][0-9])([0-9][0-9])([0-9][0-9])", flags);
76 
77  regex_compiled = TRUE;
78 }
79 
80 static gint
81 my_strntol(const char *str, int len)
82 {
83  gint res = 0;
84 
85  g_return_val_if_fail(str, 0);
86  g_return_val_if_fail(len, 0);
87 
88  while (len--)
89  {
90 
91  if (*str < '0' || *str > '9')
92  {
93  str++;
94  continue;
95  }
96 
97  res *= 10;
98  res += *(str++) - '0';
99  }
100  return res;
101 }
102 
103 /*
104  * based on a trio match (matches in spaces 1, 2, and 3), and a list
105  * of possible date formats, return the list of formats that this string
106  * could actually be.
107  */
108 static GncImportFormat
109 check_date_format(const char * str, regmatch_t *match, GncImportFormat fmts)
110 {
111  GncImportFormat res = GNCIF_NONE;
112  int len0 = 0, len1 = 0, len2 = 0;
113  int val0 = 0, val1 = 0, val2 = 0;
114 
115  g_return_val_if_fail(match, res);
116  g_return_val_if_fail(fmts, res);
117 
118  /* Compute the lengths */
119  len0 = match[1].rm_eo - match[1].rm_so;
120  len1 = match[2].rm_eo - match[2].rm_so;
121  len2 = match[3].rm_eo - match[3].rm_so;
122 
123  /* compute the numeric values */
124  val0 = my_strntol(str + match[1].rm_so, len0);
125  val1 = my_strntol(str + match[2].rm_so, len1);
126  val2 = my_strntol(str + match[3].rm_so, len2);
127 
128  /* Filter out the possibilities. Hopefully only one will remain */
129 
130  if (val0 > 12) import_clear_flag(fmts, GNCIF_DATE_MDY);
131  if (val0 > 31) import_clear_flag(fmts, GNCIF_DATE_DMY);
132  if (val0 < 1)
133  {
134  import_clear_flag(fmts, GNCIF_DATE_DMY);
135  import_clear_flag(fmts, GNCIF_DATE_MDY);
136  }
137 
138  if (val1 > 12)
139  {
140  import_clear_flag(fmts, GNCIF_DATE_DMY);
141  import_clear_flag(fmts, GNCIF_DATE_YMD);
142  }
143  if (val1 > 31)
144  {
145  import_clear_flag(fmts, GNCIF_DATE_MDY);
146  import_clear_flag(fmts, GNCIF_DATE_YDM);
147  }
148 
149  if (val2 > 12) import_clear_flag(fmts, GNCIF_DATE_YDM);
150  if (val2 > 31) import_clear_flag(fmts, GNCIF_DATE_YMD);
151  if (val2 < 1)
152  {
153  import_clear_flag(fmts, GNCIF_DATE_YMD);
154  import_clear_flag(fmts, GNCIF_DATE_YDM);
155  }
156 
157  /* if we've got a 4-character year, make sure the value is greater
158  * than 1930 and less than 2100. XXX: be sure to fix this by 2100!
159  */
160  if (len0 == 4 && (val0 < 1930 || val0 > 2100))
161  {
162  import_clear_flag(fmts, GNCIF_DATE_YMD);
163  import_clear_flag(fmts, GNCIF_DATE_YDM);
164  }
165  if (len2 == 4 && (val2 < 1930 || val2 > 2100))
166  {
167  import_clear_flag(fmts, GNCIF_DATE_MDY);
168  import_clear_flag(fmts, GNCIF_DATE_DMY);
169  }
170 
171  /* If the first string has a length of only 1, then it is definitely
172  * not a year (although it could be a month or day).
173  */
174  if (len0 == 1)
175  {
176  import_clear_flag(fmts, GNCIF_DATE_YMD);
177  import_clear_flag(fmts, GNCIF_DATE_YDM);
178  }
179 
180  return fmts;
181 }
182 
183 GncImportFormat
184 gnc_import_test_numeric(const char* str, GncImportFormat fmts)
185 {
186  GncImportFormat res = GNCIF_NONE;
187 
188  g_return_val_if_fail(str, fmts);
189 
190  if (!regex_compiled)
191  compile_regex();
192 
193  if ((fmts & GNCIF_NUM_PERIOD) && !regexec(&decimal_radix_regex, str, 0, NULL, 0))
194  import_set_flag (res, GNCIF_NUM_PERIOD);
195 
196  if ((fmts & GNCIF_NUM_COMMA) && !regexec(&comma_radix_regex, str, 0, NULL, 0))
197  import_set_flag (res, GNCIF_NUM_COMMA);
198 
199  return res;
200 }
201 
202 
203 GncImportFormat
204 gnc_import_test_date(const char* str, GncImportFormat fmts)
205 {
206  regmatch_t match[5];
207  GncImportFormat res = GNCIF_NONE;
208 
209  g_return_val_if_fail(str, fmts);
210  g_return_val_if_fail(strlen(str) > 1, fmts);
211 
212  if (!regex_compiled)
213  compile_regex();
214 
215  if (!regexec(&date_regex, str, 5, match, 0))
216  {
217  if (match[1].rm_so != -1)
218  res = check_date_format(str, match, fmts);
219  else
220  {
221  /* Hmm, it matches XXXXXXXX, but is this YYYYxxxx or xxxxYYYY?
222  * let's try both ways and let the parser check that YYYY is
223  * valid.
224  */
225  #define DATE_LEN 8
226  char temp[DATE_LEN + 1];
227 
228  g_return_val_if_fail(match[4].rm_so != -1, fmts);
229  g_return_val_if_fail(match[4].rm_eo - match[4].rm_so == DATE_LEN, fmts);
230 
231  /* make a temp copy of the XXXXXXXX string */
232  strncpy(temp, str + match[4].rm_so, DATE_LEN);
233  temp[DATE_LEN] = '\0';
234 
235  /* then check it against the ymd or mdy formats, as necessary */
236  if (((fmts & GNCIF_DATE_YDM) || (fmts & GNCIF_DATE_YMD)) &&
237  !regexec(&date_ymd_regex, temp, 4, match, 0))
238  import_set_flag (res, check_date_format (temp, match, fmts));
239 
240  if (((fmts & GNCIF_DATE_DMY) || (fmts & GNCIF_DATE_MDY)) &&
241  !regexec(&date_mdy_regex, temp, 4, match, 0))
242  import_set_flag (res, check_date_format (temp, match, fmts));
243  }
244  }
245 
246  return res;
247 }
248 
249 gboolean
250 gnc_import_parse_numeric(const char* str, GncImportFormat fmt, gnc_numeric *val)
251 {
252  g_return_val_if_fail(str, FALSE);
253  g_return_val_if_fail(val, FALSE);
254  g_return_val_if_fail(fmt, FALSE);
255  g_return_val_if_fail(!(fmt & (fmt - 1)), FALSE);
256 
257  switch (fmt)
258  {
259  case GNCIF_NUM_PERIOD:
260  return xaccParseAmountExtended(str, TRUE, '-', '.', ',', "$+",
261  val, NULL);
262  case GNCIF_NUM_COMMA:
263  return xaccParseAmountExtended(str, TRUE, '-', ',', '.', "$+",
264  val, NULL);
265  default:
266  PERR("invalid format: %d", fmt);
267  return FALSE;
268  }
269 }
270 
/* Normalise a year read from an import file to a full C.E. year.
 *
 * The inputs 0, 2000 and 19100 all produce 2000.
 *
 * y: the year as parsed from the file.
 *
 * Returns the year as an integer in the C.E.
 */
static int
fix_year(int y)
{
    int year = y;

    if (y < 70)
    {
        /* Two-digit years below 70 are taken to be in the 2000s. */
        year = y + 2000;
    }
    else if (y > 19000)
    {
        /* Undo the common bug of printing post-2000 dates as 19100 etc.:
         * treat the digits after the leading "19" as years since 1900.
         */
        year = y - 19000 + 1900;
    }
    else if (y < 1902)
    {
        /* Treated as a 'unix year', i.e. a count of years since 1900.
         * A genuine full year below 1902 cannot be told apart from one.
         */
        year = y + 1900;
    }
    /* Anything else is taken to be a full year already. */

    return year;
}
298 
299 gboolean
300 gnc_import_parse_date(const char *str, GncImportFormat fmt, time64 *val)
301 {
302  regmatch_t match[5];
303  char temp[9];
304  const char *datestr;
305 
306  int v0 = 0, v1 = 0, v2 = 0;
307  int m = 0, d = 0, y = 0;
308 
309  g_return_val_if_fail(str, FALSE);
310  g_return_val_if_fail(val, FALSE);
311  g_return_val_if_fail(fmt, FALSE);
312  g_return_val_if_fail(!(fmt & (fmt - 1)), FALSE);
313 
314  if (!regexec(&date_regex, str, 5, match, 0))
315  {
316  if (match[1].rm_so != -1)
317  datestr = str;
318  else
319  {
320  /* date is of the form XXXXXXX; save it to a temp string and
321  * split it based on the format, either YYYYaabb or aabbYYYY
322  */
323  g_return_val_if_fail(match[4].rm_so != -1, FALSE);
324  g_return_val_if_fail(match[4].rm_eo - match[4].rm_so == 8, FALSE);
325 
326  strncpy(temp, str + match[4].rm_so, 8);
327  temp[8] = '\0';
328 
329  switch (fmt)
330  {
331  case GNCIF_DATE_DMY:
332  case GNCIF_DATE_MDY:
333  g_return_val_if_fail(!regexec(&date_mdy_regex, temp, 4, match, 0), FALSE);
334  break;
335  case GNCIF_DATE_YMD:
336  case GNCIF_DATE_YDM:
337  g_return_val_if_fail(!regexec(&date_ymd_regex, temp, 4, match, 0), FALSE);
338  break;
339  default:
340  PERR("Invalid date format provided: %d", fmt);
341  return FALSE;
342  }
343  datestr = temp;
344  }
345 
346  /* datestr points to the date string, and match[123] contains the matches. */
347 
348  if (match[1].rm_so == -1 || match[2].rm_so == -1 || match[3].rm_so == -1)
349  {
350  PERR("can't interpret date %s", str);
351  return FALSE;
352  }
353 
354  /* grab the numerics */
355  v0 = my_strntol(datestr + match[1].rm_so, match[1].rm_eo - match[1].rm_so);
356  v1 = my_strntol(datestr + match[2].rm_so, match[2].rm_eo - match[2].rm_so);
357  v2 = my_strntol(datestr + match[3].rm_so, match[3].rm_eo - match[3].rm_so);
358 
359  switch (fmt)
360  {
361  case GNCIF_DATE_DMY:
362  if (v0 > 0 && v0 <= 31 && v1 > 0 && v1 <= 12 && v2 > 0)
363  {
364  d = v0;
365  m = v1;
366  y = v2;
367  }
368  else
369  PERR("format is d/m/y but date is %s", str);
370  break;
371 
372  case GNCIF_DATE_MDY:
373  if (v0 > 0 && v0 <= 12 && v1 > 0 && v1 <= 31 && v2 > 0)
374  {
375  m = v0;
376  d = v1;
377  y = v2;
378  }
379  else
380  PERR("format is m/d/y but date is %s", str);
381  break;
382 
383  case GNCIF_DATE_YMD:
384  if (v0 > 0 && v1 > 0 && v1 <= 12 && v2 > 0 && v2 <= 31)
385  {
386  y = v0;
387  m = v1;
388  d = v2;
389  }
390  else
391  PERR("format is y/m/d but date is %s", str);
392  break;
393 
394  case GNCIF_DATE_YDM:
395  if (v0 > 0 && v1 > 0 && v1 <= 31 && v2 > 0 && v2 <= 12)
396  {
397  y = v0;
398  d = v1;
399  m = v2;
400  }
401  else
402  PERR("format is y/d/m but date is %s", str);
403  break;
404 
405  default:
406  PERR("invalid date format: %d", fmt);
407  }
408 
409  if (!m || !d || !y)
410  return FALSE;
411 
412  y = fix_year(y);
413  *val = gnc_dmy2time64(d, m, y);
414  return TRUE;
415  }
416 
417  return FALSE;
418 }
utility functions for the GnuCash UI
time64 gnc_dmy2time64(gint day, gint month, gint year)
Convert a day, month, and year to a time64, returning the first second of the day.
#define PERR(format, args...)
Log a serious error.
Definition: qoflog.h:244
gboolean xaccParseAmountExtended(const char *in_str, gboolean monetary, gunichar negative_sign, gunichar decimal_point, gunichar group_separator, const char *ignore_list, gnc_numeric *result, char **endstr)
Converts a string to a gnc_numeric.
All type declarations for the whole Gnucash engine.
gint64 time64
Most systems that are currently maintained, including Microsoft Windows, BSD-derived Unixes and Linux...
Definition: gnc-date.h:87