load_csv.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP
13 #define MLPACK_CORE_DATA_LOAD_CSV_HPP
14 
15 #include <boost/spirit/include/qi.hpp>
16 #include <boost/algorithm/string/trim.hpp>
17 
18 #include <mlpack/core.hpp>
19 #include <mlpack/core/util/log.hpp>
20 
21 #include <set>
22 #include <string>
23 
24 #include "extension.hpp"
25 #include "format.hpp"
26 #include "dataset_mapper.hpp"
27 
28 namespace mlpack {
29 namespace data {
30 
36 class LoadCSV
37 {
38  public:
43  LoadCSV(const std::string& file);
44 
54  template<typename T, typename PolicyType>
55  void Load(arma::Mat<T> &inout,
57  const bool transpose = true)
58  {
59  CheckOpen();
60 
61  if (transpose)
62  TransposeParse(inout, infoSet);
63  else
64  NonTransposeParse(inout, infoSet);
65  }
66 
77  template<typename T, typename MapPolicy>
78  void GetMatrixSize(size_t& rows, size_t& cols, DatasetMapper<MapPolicy>& info)
79  {
80  using namespace boost::spirit;
81 
82  // Take a pass through the file. If the DatasetMapper policy requires it,
83  // we will pass everything string through MapString(). This might be useful
84  // if, e.g., the MapPolicy needs to find which dimensions are numeric or
85  // categorical.
86 
87  // Reset to the start of the file.
88  inFile.clear();
89  inFile.seekg(0, std::ios::beg);
90  rows = 0;
91  cols = 0;
92 
93  // First, count the number of rows in the file (this is the dimensionality).
94  std::string line;
95  while (std::getline(inFile, line))
96  {
97  ++rows;
98  }
99  info = DatasetMapper<MapPolicy>(rows);
100 
101  // Now, jump back to the beginning of the file.
102  inFile.clear();
103  inFile.seekg(0, std::ios::beg);
104  rows = 0;
105 
106  while (std::getline(inFile, line))
107  {
108  ++rows;
109  // Remove whitespace from either side.
110  boost::trim(line);
111 
112  if (rows == 1)
113  {
114  // Extract the number of columns.
115  auto findColSize = [&cols](iter_type) { ++cols; };
116  qi::parse(line.begin(), line.end(),
117  stringRule[findColSize] % delimiterRule);
118  }
119 
120  // I guess this is technically a second pass, but that's ok... still the
121  // same idea...
122  if (MapPolicy::NeedsFirstPass)
123  {
124  // In this case we must pass everything we parse to the MapPolicy.
125  auto firstPassMap = [&](const iter_type& iter)
126  {
127  std::string str(iter.begin(), iter.end());
128  boost::trim(str);
129 
130  info.template MapFirstPass<T>(std::move(str), rows - 1);
131  };
132 
133  // Now parse the line.
134  qi::parse(line.begin(), line.end(),
135  stringRule[firstPassMap] % delimiterRule);
136  }
137  }
138  }
139 
150  template<typename T, typename MapPolicy>
151  void GetTransposeMatrixSize(size_t& rows,
152  size_t& cols,
154  {
155  using namespace boost::spirit;
156 
157  // Take a pass through the file. If the DatasetMapper policy requires it,
158  // we will pass everything string through MapString(). This might be useful
159  // if, e.g., the MapPolicy needs to find which dimensions are numeric or
160  // categorical.
161 
162  // Reset to the start of the file.
163  inFile.clear();
164  inFile.seekg(0, std::ios::beg);
165  rows = 0;
166  cols = 0;
167 
168  std::string line;
169  while (std::getline(inFile, line))
170  {
171  ++cols;
172  // Remove whitespace from either side.
173  boost::trim(line);
174 
175  if (cols == 1)
176  {
177  // Extract the number of dimensions.
178  auto findRowSize = [&rows](iter_type) { ++rows; };
179  qi::parse(line.begin(), line.end(),
180  stringRule[findRowSize] % delimiterRule);
181 
182  // Now that we know the dimensionality, initialize the DatasetMapper.
183  info.SetDimensionality(rows);
184  }
185 
186  // If we need to do a first pass for the DatasetMapper, do it.
187  if (MapPolicy::NeedsFirstPass)
188  {
189  size_t dim = 0;
190 
191  // In this case we must pass everything we parse to the MapPolicy.
192  auto firstPassMap = [&](const iter_type& iter)
193  {
194  std::string str(iter.begin(), iter.end());
195  boost::trim(str);
196 
197  info.template MapFirstPass<T>(std::move(str), dim++);
198  };
199 
200  // Now parse the line.
201  qi::parse(line.begin(), line.end(),
202  stringRule[firstPassMap] % delimiterRule);
203  }
204  }
205  }
206 
207  private:
208  using iter_type = boost::iterator_range<std::string::iterator>;
209 
214  void CheckOpen();
215 
222  template<typename T, typename PolicyType>
223  void NonTransposeParse(arma::Mat<T>& inout,
224  DatasetMapper<PolicyType>& infoSet)
225  {
226  using namespace boost::spirit;
227 
228  // Get the size of the matrix.
229  size_t rows, cols;
230  GetMatrixSize<T>(rows, cols, infoSet);
231 
232  // Set up output matrix.
233  inout.set_size(rows, cols);
234  size_t row = 0;
235  size_t col = 0;
236 
237  // Reset file position.
238  std::string line;
239  inFile.clear();
240  inFile.seekg(0, std::ios::beg);
241 
242  auto setCharClass = [&](iter_type const &iter)
243  {
244  std::string str(iter.begin(), iter.end());
245  if (str == "\t")
246  {
247  str.clear();
248  }
249  boost::trim(str);
250 
251  inout(row, col++) = infoSet.template MapString<T>(std::move(str), row);
252  };
253 
254  while (std::getline(inFile, line))
255  {
256  // Remove whitespace from either side.
257  boost::trim(line);
258 
259  // Parse the numbers from a line (ex: 1,2,3,4); if the parser finds a
260  // number it will execute the setNum function.
261  const bool canParse = qi::parse(line.begin(), line.end(),
262  stringRule[setCharClass] % delimiterRule);
263 
264  // Make sure we got the right number of rows.
265  if (col != cols)
266  {
267  std::ostringstream oss;
268  oss << "LoadCSV::NonTransposeParse(): wrong number of dimensions ("
269  << col << ") on line " << row << "; should be " << cols
270  << " dimensions.";
271  throw std::runtime_error(oss.str());
272  }
273 
274  if (!canParse)
275  {
276  std::ostringstream oss;
277  oss << "LoadCSV::NonTransposeParse(): parsing error on line " << col
278  << "!";
279  throw std::runtime_error(oss.str());
280  }
281 
282  ++row; col = 0;
283  }
284  }
285 
292  template<typename T, typename PolicyType>
293  void TransposeParse(arma::Mat<T>& inout, DatasetMapper<PolicyType>& infoSet)
294  {
295  using namespace boost::spirit;
296 
297  // Get matrix size. This also initializes infoSet correctly.
298  size_t rows, cols;
299  GetTransposeMatrixSize<T>(rows, cols, infoSet);
300 
301  // Set the matrix size.
302  inout.set_size(rows, cols);
303 
304  // Initialize auxiliary variables.
305  size_t row = 0;
306  size_t col = 0;
307  std::string line;
308  inFile.clear();
309  inFile.seekg(0, std::ios::beg);
310 
315  auto parseString = [&](iter_type const &iter)
316  {
317  // All parsed values must be mapped.
318  std::string str(iter.begin(), iter.end());
319  boost::trim(str);
320 
321  inout(row, col) = infoSet.template MapString<T>(std::move(str), row);
322  ++row;
323  };
324 
325  while (std::getline(inFile, line))
326  {
327  // Remove whitespace from either side.
328  boost::trim(line);
329 
330  // Reset the row we are looking at. (Remember this is transposed.)
331  row = 0;
332 
333  // Now use boost::spirit to parse the characters of the line;
334  // parseString() will be called when a token is detected.
335  const bool canParse = qi::parse(line.begin(), line.end(),
336  stringRule[parseString] % delimiterRule);
337 
338  // Make sure we got the right number of rows.
339  if (row != rows)
340  {
341  std::ostringstream oss;
342  oss << "LoadCSV::TransposeParse(): wrong number of dimensions (" << row
343  << ") on line " << col << "; should be " << rows << " dimensions.";
344  throw std::runtime_error(oss.str());
345  }
346 
347  if (!canParse)
348  {
349  std::ostringstream oss;
350  oss << "LoadCSV::TransposeParse(): parsing error on line " << col
351  << "!";
352  throw std::runtime_error(oss.str());
353  }
354 
355  // Increment the column index.
356  ++col;
357  }
358  }
359 
361  boost::spirit::qi::rule<std::string::iterator, iter_type()> stringRule;
363  boost::spirit::qi::rule<std::string::iterator, iter_type()> delimiterRule;
364 
366  std::string extension;
368  std::string filename;
370  std::ifstream inFile;
371 };
372 
373 } // namespace data
374 } // namespace mlpack
375 
376 #endif
Auxiliary information for a dataset, including mappings to/from strings (or other types) and the data...
Load the csv file. This class uses boost::spirit to implement the parser; please refer to the following lin...
Definition: load_csv.hpp:36
void Load(arma::Mat< T > &inout, DatasetMapper< PolicyType > &infoSet, const bool transpose=true)
Load the file into the given matrix with the given DatasetMapper object.
Definition: load_csv.hpp:55
.hpp
Definition: add_to_po.hpp:21
void GetTransposeMatrixSize(size_t &rows, size_t &cols, DatasetMapper< MapPolicy > &info)
Peek at the file to determine the number of rows and columns in the matrix, assuming a transposed mat...
Definition: load_csv.hpp:151
LoadCSV(const std::string &file)
Construct the LoadCSV object on the given file.
void GetMatrixSize(size_t &rows, size_t &cols, DatasetMapper< MapPolicy > &info)
Peek at the file to determine the number of rows and columns in the matrix, assuming a non-transposed...
Definition: load_csv.hpp:78
Include all of the base components required to write mlpack methods, and the main mlpack Doxygen docu...
void SetDimensionality(const size_t dimensionality)
Set the dimensionality of an existing DatasetMapper object.