GnuCash  5.6-150-g038405b370+
gnc-tokenizer.cpp
1 /********************************************************************\
2  * gnc-tokenizer.cpp - base class for converting a text file into a *
3  * two-dimensional vector of strings (table) *
4  * *
5  * Copyright (C) 2015 Geert Janssens <geert@kobaltwit.be> *
6  * *
7  * This program is free software; you can redistribute it and/or *
8  * modify it under the terms of the GNU General Public License as *
9  * published by the Free Software Foundation; either version 2 of *
10  * the License, or (at your option) any later version. *
11  * *
12  * This program is distributed in the hope that it will be useful, *
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15  * GNU General Public License for more details. *
16  * *
17  * You should have received a copy of the GNU General Public License*
18  * along with this program; if not, contact: *
19  * *
20  * Free Software Foundation Voice: +1-617-542-5942 *
21  * 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
22  * Boston, MA 02110-1301, USA gnu@gnu.org *
23 \********************************************************************/
24 
25 #include "gnc-tokenizer.hpp"
26 #include "gnc-tokenizer-csv.hpp"
27 #include "gnc-tokenizer-dummy.hpp"
28 #include "gnc-tokenizer-fw.hpp"
29 
30 #include <iostream>
31 #include <fstream> // fstream
32 #include <vector>
33 #include <string>
34 #include <algorithm> // copy
35 #include <iterator> // ostream_operator
36 #include <memory>
37 
38 #include <boost/locale.hpp>
39 #include <boost/algorithm/string.hpp>
40 
41 #include <go-glib-extras.h>
42 #include <glib.h>
43 #include <glib/gstdio.h>
44 
45 std::unique_ptr<GncTokenizer> gnc_tokenizer_factory(GncImpFileFormat fmt)
46 {
47  std::unique_ptr<GncTokenizer> tok(nullptr);
48  switch (fmt)
49  {
50  case GncImpFileFormat::CSV:
51  tok.reset(new GncCsvTokenizer());
52  break;
53  case GncImpFileFormat::FIXED_WIDTH:
54  tok.reset(new GncFwTokenizer());
55  break;
56  default:
57  tok.reset(new GncDummyTokenizer());
58  break;
59  }
60 
61  return tok;
62 }
63 
64 void
65 GncTokenizer::load_file(const std::string& path)
66 {
67  if (path.empty())
68  return;
69 
70  m_imp_file_str = path;
71  char *raw_contents;
72  size_t raw_length;
73  GError *error = nullptr;
74 
75  if (!g_file_get_contents(path.c_str(), &raw_contents, &raw_length, &error))
76  {
77  std::string msg {error->message};
78  g_error_free (error);
79  throw std::ifstream::failure {msg};
80  }
81 
82  m_raw_contents = raw_contents;
83  g_free(raw_contents);
84 
85  // Guess encoding, user can override if needed later on.
86  const char *guessed_enc = NULL;
87  guessed_enc = go_guess_encoding (m_raw_contents.c_str(),
88  m_raw_contents.length(),
89  m_enc_str.empty() ? "UTF-8" : m_enc_str.c_str(),
90  NULL);
91  if (guessed_enc)
92  this->encoding(guessed_enc);
93  else
94  m_enc_str.clear();
95 
96 }
97 
98 const std::string&
99 GncTokenizer::current_file()
100 {
101  return m_imp_file_str;
102 }
103 
104 void
105 GncTokenizer::encoding(const std::string& encoding)
106 {
107  m_enc_str = encoding;
108  m_utf8_contents = boost::locale::conv::to_utf<char>(m_raw_contents, m_enc_str);
109 
110  // While we are converting here, let's also normalize line-endings to "\n"
111  // That's what STL expects by default
112  boost::replace_all (m_utf8_contents, "\r\n", "\n");
113  boost::replace_all (m_utf8_contents, "\r", "\n");
114 }
115 
116 const std::string&
117 GncTokenizer::encoding()
118 {
119  return m_enc_str;
120 }
121 
122 
123 const std::vector<StrVec>&
124 GncTokenizer::get_tokens()
125 {
126  return m_tokenized_contents;
127 }
Class to convert a CSV file into a vector of string vectors.
Class to convert a file into a vector of string vectors.
GncImpFileFormat
Enumeration for file formats supported by this importer.
Class to convert a file with fixed-width delimited contents into a vector of string vectors.
Dummy converter class to convert a file into a vector of string vectors.