GnuCash  5.6-150-g038405b370+
gnc-tokenizer-fw.cpp
1 /********************************************************************\
2  * gnc-tokenizer-fw.cpp - takes a file and converts it into a *
3  * two-dimensional vector of strings (table) *
4  * splitting the contents on fixed width *
5  * positions *
6  * *
7  * Copyright (C) 2015 Geert Janssens <geert@kobaltwit.be> *
8  * *
9  * This program is free software; you can redistribute it and/or *
10  * modify it under the terms of the GNU General Public License as *
11  * published by the Free Software Foundation; either version 2 of *
12  * the License, or (at your option) any later version. *
13  * *
14  * This program is distributed in the hope that it will be useful, *
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
17  * GNU General Public License for more details. *
18  * *
19  * You should have received a copy of the GNU General Public License*
20  * along with this program; if not, contact: *
21  * *
22  * Free Software Foundation Voice: +1-617-542-5942 *
23  * 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
24  * Boston, MA 02110-1301, USA gnu@gnu.org *
25 \********************************************************************/
26 
27 #include "gnc-tokenizer-fw.hpp"
28 
29 #include <iostream>
30 #include <fstream> // fstream
31 #include <vector>
32 #include <string>
33 #include <algorithm> // copy
34 #include <iterator> // ostream_operator
35 
36 #include <boost/tokenizer.hpp>
37 #include <boost/locale.hpp>
38 #include <boost/algorithm/string.hpp>
39 
40 void
41 GncFwTokenizer::columns(const std::vector<uint32_t>& cols)
42 {
43  m_col_vec = cols;
44 }
45 
46 std::vector<uint32_t>
47 GncFwTokenizer::get_columns()
48 {
49  return m_col_vec;
50 }
51 
52 
53 bool GncFwTokenizer::col_can_delete (uint32_t col_num)
54 {
55  auto last_col = m_col_vec.size() - 1;
56  if (col_num >= last_col)
57  return false;
58  else
59  return true;
60 }
61 
62 void GncFwTokenizer::col_delete (uint32_t col_num)
63 {
64  if (!col_can_delete (col_num))
65  return;
66 
67  m_col_vec[col_num + 1] += m_col_vec[col_num];
68  m_col_vec.erase (m_col_vec.begin() + col_num);
69 }
70 
71 bool GncFwTokenizer::col_can_narrow (uint32_t col_num)
72 {
73  // Can't narrow the last column, it always sticks to the end of the parseable data
74  auto last_col = m_col_vec.size() - 1;
75  if (col_num >= last_col)
76  return false;
77  else
78  return true;
79 }
80 
81 void GncFwTokenizer::col_narrow (uint32_t col_num)
82 {
83  if (!col_can_narrow (col_num))
84  return;
85 
86  m_col_vec[col_num]--;
87  m_col_vec[col_num + 1]++;
88 
89  // Drop column if it has become 0-width now
90  if (m_col_vec[col_num] == 0)
91  m_col_vec.erase (m_col_vec.begin() + col_num);
92 }
93 
94 bool GncFwTokenizer::col_can_widen (uint32_t col_num)
95 {
96  // Can't widen the last column, it always sticks to the end of the parseable data
97  auto last_col = m_col_vec.size() - 1;
98  if (col_num >= last_col)
99  return false;
100  else
101  return true;
102 }
103 
104 void GncFwTokenizer::col_widen (uint32_t col_num)
105 {
106  if (!col_can_widen (col_num))
107  return;
108 
109  m_col_vec[col_num]++;
110  m_col_vec[col_num + 1]--;
111 
112  // Drop next column if it has become 0-width now
113  if (m_col_vec[col_num + 1] == 0)
114  m_col_vec.erase (m_col_vec.begin() + col_num + 1);
115 }
116 
117 bool GncFwTokenizer::col_can_split (uint32_t col_num, uint32_t position)
118 {
119  auto last_col = m_col_vec.size() - 1;
120  if (col_num > last_col)
121  return false;
122 
123  uint32_t col_end = m_col_vec[col_num];
124  if (position < 1 || position >= col_end)
125  return false;
126  else
127  return true;
128 }
129 
130 void GncFwTokenizer::col_split (uint32_t col_num, uint32_t position)
131 {
132  if (col_can_split (col_num, position))
133  {
134  m_col_vec.insert (m_col_vec.begin() + col_num, position);
135  m_col_vec[col_num + 1] -= position;
136  }
137 }
138 
139 
140 void GncFwTokenizer::load_file(const std::string& path)
141 {
142  GncTokenizer::load_file(path);
143 
144  std::string line;
145  m_longest_line = 0;
146  std::istringstream in_stream(m_utf8_contents);
147  while (std::getline (in_stream, line))
148  {
149  if (line.size() > m_longest_line)
150  m_longest_line = line.size();
151 
152  line.clear();
153  }
154 
155  if (m_col_vec.empty())
156  /* Set a sane default for the offsets
157  * That is, assume one column with all the data */
158  m_col_vec.push_back(m_longest_line);
159  else
160  {
161  /* Adjust existing last column(s) so the total column width
162  * equals the width of the longest line
163  * This may mean
164  * - widening the last column to widen to the longest line or
165  * - deleting columns/narrowing the last one to reduce to the longest line
166  */
167  uint32_t total_width = 0;
168  for (auto col_width : m_col_vec)
169  total_width += col_width;
170 
171  if (m_longest_line > total_width)
172  m_col_vec.back() += m_longest_line - total_width;
173  else if (m_longest_line < total_width)
174  {
175  while (total_width - m_col_vec.back() > m_longest_line)
176  col_delete (m_col_vec[m_col_vec.size() - 2]);
177  m_col_vec.back() -= total_width - m_longest_line;
178  }
179  }
180 }
181 
182 /* Fixed width tokenizer uses wide characters internally instead of
183  * narrow (possibly multi-byte) characters. With multi-byte characters
184  * the character offsets are incorrectly interpreted as byte offsets and
185  * multi-byte characters (like the € sign in utf-8) could be inadvertently
186  * split. This doesn't happen with wide characters.
187  */
/** Split the loaded contents into a table of strings on the fixed
 *  column offsets in m_col_vec, storing the result in
 *  m_tokenized_contents.
 *  @return always 0 (kept for interface compatibility).
 */
int GncFwTokenizer::tokenize()
{
    using boost::locale::conv::utf_to_utf;
    using Tokenizer = boost::tokenizer< boost::offset_separator,
                                        std::wstring::const_iterator, std::wstring > ;

    // NOTE(review): the third argument presumably disables wrapping the
    // offsets, so characters beyond the last column are dropped rather
    // than starting a new row - confirm against boost::offset_separator docs.
    boost::offset_separator sep(m_col_vec.begin(), m_col_vec.end(), false);

    // Convert the utf-8 buffer to wide characters so the column offsets
    // count characters instead of bytes (see the comment above this
    // function about multi-byte characters).
    std::wstring wchar_contents = utf_to_utf<wchar_t>(m_utf8_contents.c_str(),
                                  m_utf8_contents.c_str() + m_utf8_contents.size());

    StrVec vec;          // tokens of the line currently being split
    std::wstring line;

    m_tokenized_contents.clear();
    std::wistringstream in_stream(wchar_contents);

    while (std::getline (in_stream, line))
    {
        Tokenizer tok(line, sep);
        vec.clear();
        for (auto token : tok)
        {
            auto stripped = boost::trim_copy(token); // strips newlines as well as whitespace
            // Store each token back as utf-8 (narrow) text
            auto narrow = utf_to_utf<char>(stripped.c_str(), stripped.c_str()
                                           + stripped.size());
            vec.push_back (narrow);
        }
        m_tokenized_contents.push_back(vec);
        line.clear(); // clear here, next check could fail
    }

    return 0;
}
Class to convert a file with fixed-width delimited contents into a vector of string vectors (a table).