GnuCash  5.6-150-g038405b370+
gnc-tokenizer-fw.cpp
1 /********************************************************************\
2  * gnc-tokenizer-fw.cpp - takes a file and converts it into a *
3  * two-dimensional vector of strings (table) *
4  * splitting the contents on fixed width *
5  * positions *
6  * *
7  * Copyright (C) 2015 Geert Janssens <geert@kobaltwit.be> *
8  * *
9  * This program is free software; you can redistribute it and/or *
10  * modify it under the terms of the GNU General Public License as *
11  * published by the Free Software Foundation; either version 2 of *
12  * the License, or (at your option) any later version. *
13  * *
14  * This program is distributed in the hope that it will be useful, *
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
17  * GNU General Public License for more details. *
18  * *
19  * You should have received a copy of the GNU General Public License*
20  * along with this program; if not, contact: *
21  * *
22  * Free Software Foundation Voice: +1-617-542-5942 *
23  * 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
24  * Boston, MA 02110-1301, USA gnu@gnu.org *
25 \********************************************************************/
26 
27 #include "gnc-tokenizer-fw.hpp"
28 
29 #include <iostream>
30 #include <fstream> // fstream
31 #include <vector>
32 #include <string>
33 #include <algorithm> // copy
34 #include <iterator> // ostream_operator
35 
36 #include <boost/tokenizer.hpp>
37 #include <boost/locale.hpp>
38 #include <boost/algorithm/string.hpp>
39 
40 void
41 GncFwTokenizer::columns(const std::vector<uint32_t>& cols)
42 {
43  m_col_vec = cols;
44 }
45 
46 std::vector<uint32_t>
47 GncFwTokenizer::get_columns()
48 {
49  return m_col_vec;
50 }
51 
52 
53 bool GncFwTokenizer::col_can_delete (uint32_t col_num)
54 {
55  auto last_col = m_col_vec.size() - 1;
56  if (col_num >= last_col)
57  return false;
58  else
59  return true;
60 }
61 
62 void GncFwTokenizer::col_delete (uint32_t col_num)
63 {
64  if (!col_can_delete (col_num))
65  return;
66 
67  m_col_vec[col_num + 1] += m_col_vec[col_num];
68  m_col_vec.erase (m_col_vec.begin() + col_num);
69 }
70 
71 bool GncFwTokenizer::col_can_narrow (uint32_t col_num)
72 {
73  // Can't narrow the last column, it always sticks to the end of the parseable data
74  auto last_col = m_col_vec.size() - 1;
75  if (col_num >= last_col)
76  return false;
77  else
78  return true;
79 }
80 
81 void GncFwTokenizer::col_narrow (uint32_t col_num)
82 {
83  if (!col_can_narrow (col_num))
84  return;
85 
86  m_col_vec[col_num]--;
87  m_col_vec[col_num + 1]++;
88 
89  // Drop column if it has become 0-width now
90  if (m_col_vec[col_num] == 0)
91  m_col_vec.erase (m_col_vec.begin() + col_num);
92 }
93 
94 bool GncFwTokenizer::col_can_widen (uint32_t col_num)
95 {
96  // Can't widen the last column, it always sticks to the end of the parseable data
97  auto last_col = m_col_vec.size() - 1;
98  if (col_num >= last_col)
99  return false;
100  else
101  return true;
102 }
103 
104 void GncFwTokenizer::col_widen (uint32_t col_num)
105 {
106  if (!col_can_widen (col_num))
107  return;
108 
109  m_col_vec[col_num]++;
110  m_col_vec[col_num + 1]--;
111 
112  // Drop next column if it has become 0-width now
113  if (m_col_vec[col_num + 1] == 0)
114  m_col_vec.erase (m_col_vec.begin() + col_num + 1);
115 }
116 
117 bool GncFwTokenizer::col_can_split (uint32_t col_num, uint32_t position)
118 {
119  auto last_col = m_col_vec.size() - 1;
120  if (col_num > last_col)
121  return false;
122 
123  uint32_t col_end = m_col_vec[col_num];
124  if (position < 1 || position >= col_end)
125  return false;
126  else
127  return true;
128 }
129 
130 void GncFwTokenizer::col_split (uint32_t col_num, uint32_t position)
131 {
132  if (col_can_split (col_num, position))
133  {
134  m_col_vec.insert (m_col_vec.begin() + col_num, position);
135  m_col_vec[col_num + 1] -= position;
136  }
137 }
138 
139 
140 void GncFwTokenizer::load_file(const std::string& path)
141 {
142  GncTokenizer::load_file(path);
143 
144  std::string line;
145  m_longest_line = 0;
146  std::istringstream in_stream(m_utf8_contents);
147  while (std::getline (in_stream, line))
148  {
149  if (line.size() > m_longest_line)
150  m_longest_line = line.size();
151 
152  line.clear();
153  }
154 
155  if (m_col_vec.empty())
156  /* Set a sane default for the offsets
157  * That is, assume one column with all the data */
158  m_col_vec.push_back(m_longest_line);
159  else
160  {
161  /* Adjust existing last column(s) so the total column width
162  * equals the width of the longest line
163  * This may mean
164  * - widening the last column to widen to the longest line or
165  * - deleting columns/narrowing the last one to reduce to the longest line
166  */
167  uint32_t total_width = 0;
168  for (auto col_width : m_col_vec)
169  total_width += col_width;
170 
171  if (m_longest_line > total_width)
172  m_col_vec.back() += m_longest_line - total_width;
173  else if (m_longest_line < total_width)
174  {
175  while (total_width - m_col_vec.back() > m_longest_line)
176  col_delete (m_col_vec[m_col_vec.size() - 2]);
177  m_col_vec.back() -= total_width - m_longest_line;
178  }
179  }
180 }
181 
182 /* Fixed width tokenizer uses wide characters internally instead of
183  * narrow (possibly multi-byte) characters. With multi-byte characters
184  * the character offsets are incorrectly interpreted as byte offsets and
185  * multi-byte characters (like the € sign in utf-8) could be inadvertently
186  * split. This doesn't happen with wide characters.
187  */
/** Split the loaded contents into a table of strings on the fixed
 *  column offsets in m_col_vec, storing the result in
 *  m_tokenized_contents.
 *  @return always 0 (kept for interface compatibility).
 */
int GncFwTokenizer::tokenize()
{
    using boost::locale::conv::utf_to_utf;
    using Tokenizer = boost::tokenizer< boost::offset_separator,
                                        std::wstring::const_iterator, std::wstring > ;

    // NOTE(review): the third argument presumably disables wrapping the
    // offsets, so characters beyond the last column are dropped rather
    // than starting a new row - confirm against boost::offset_separator docs.
    boost::offset_separator sep(m_col_vec.begin(), m_col_vec.end(), false);

    // Convert the utf-8 buffer to wide characters so the column offsets
    // count characters instead of bytes (see the comment above this
    // function about multi-byte characters).
    std::wstring wchar_contents = utf_to_utf<wchar_t>(m_utf8_contents.c_str(),
                                  m_utf8_contents.c_str() + m_utf8_contents.size());

    StrVec vec;          // tokens of the line currently being split
    std::wstring line;

    m_tokenized_contents.clear();
    std::wistringstream in_stream(wchar_contents);

    while (std::getline (in_stream, line))
    {
        Tokenizer tok(line, sep);
        vec.clear();
        for (auto token : tok)
        {
            auto stripped = boost::trim_copy(token); // strips newlines as well as whitespace
            // Store each token back as utf-8 (narrow) text
            auto narrow = utf_to_utf<char>(stripped.c_str(), stripped.c_str()
                                           + stripped.size());
            vec.push_back (narrow);
        }
        m_tokenized_contents.push_back(vec);
        line.clear(); // clear here, next check could fail
    }

    return 0;
}
Class to convert a file with fixed-width delimited contents into a vector of string vectors (a table).