tf_idf_encoding_policy.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP
14 #define MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP
15 
16 #include <mlpack/prereqs.hpp>
19 
20 namespace mlpack {
21 namespace data {
22 
36 {
37  public:
53  enum class TfTypes
54  {
55  BINARY,
56  RAW_COUNT,
59  };
60 
76  const bool smoothIdf = true) :
77  tfType(tfType),
78  smoothIdf(smoothIdf)
79  { }
80 
84  void Reset()
85  {
86  tokensFrequences.clear();
87  numContainingStrings.clear();
88  linesSizes.clear();
89  }
90 
/**
 * Shape the output matrix for the encoder by calling
 * output.zeros(dictionarySize, datasetSize): one row per dictionary entry and
 * one column per dataset line.
 *
 * @tparam MatType Matrix type providing zeros(rows, cols).
 * @param output Matrix to initialize.
 * @param datasetSize Number of columns requested.
 * @param maxNumTokens Unused by this policy.
 * @param dictionarySize Number of rows requested.
 */
template<typename MatType>
static void InitMatrix(MatType& output,
                       const size_t datasetSize,
                       const size_t /* maxNumTokens */,
                       const size_t dictionarySize)
{
  const size_t numRows = dictionarySize;
  const size_t numCols = datasetSize;

  output.zeros(numRows, numCols);
}
111 
/**
 * Initialize the output for the vector-of-vectors representation: datasetSize
 * rows, each holding dictionarySize value-initialized elements.
 *
 * Any previous content of the output is discarded, so a container that is
 * reused between encodings never leaks stale values or stale row lengths.
 *
 * @tparam ElemType Type of the stored elements.
 * @param output Container to initialize.
 * @param datasetSize Number of rows (one per dataset line).
 * @param maxNumTokens Unused by this policy.
 * @param dictionarySize Number of elements in every row.
 */
template<typename ElemType>
static void InitMatrix(std::vector<std::vector<ElemType>>& output,
                       const size_t datasetSize,
                       const size_t /* maxNumTokens */,
                       const size_t dictionarySize)
{
  // assign() replaces every row; resize() would keep rows that already exist
  // untouched, leaving old values and possibly the wrong row length behind.
  output.assign(datasetSize, std::vector<ElemType>(dictionarySize));
}
134 
147  template<typename MatType>
148  void Encode(MatType& output,
149  const size_t value,
150  const size_t line,
151  const size_t /* index */)
152  {
153  const typename MatType::elem_type tf =
154  TermFrequency<typename MatType::elem_type>(
155  tokensFrequences[line][value], linesSizes[line]);
156 
157  const typename MatType::elem_type idf =
158  InverseDocumentFrequency<typename MatType::elem_type>(
159  output.n_cols, numContainingStrings[value]);
160 
161  output(value - 1, line) = tf * idf;
162  }
163 
179  template<typename ElemType>
180  void Encode(std::vector<std::vector<ElemType>>& output,
181  const size_t value,
182  const size_t line,
183  const size_t /* index */)
184  {
185  const ElemType tf = TermFrequency<ElemType>(
186  tokensFrequences[line][value], linesSizes[line]);
187 
188  const ElemType idf = InverseDocumentFrequency<ElemType>(
189  output.size(), numContainingStrings[value]);
190 
191  output[line][value - 1] = tf * idf;
192  }
193 
194  /*
195  * The function calculates the necessary statistics for the purpose
196  * of the tf-idf algorithm during the first pass through the dataset.
197  *
198  * @param line The line number at which the encoding is performed.
199  * @param index The token sequence number in the line.
200  * @param value The encoded token.
201  */
202  void PreprocessToken(const size_t line,
203  const size_t /* index */,
204  const size_t value)
205  {
206  if (line >= tokensFrequences.size())
207  {
208  linesSizes.resize(line + 1);
209  tokensFrequences.resize(line + 1);
210  }
211 
212  tokensFrequences[line][value]++;
213 
214  if (tokensFrequences[line][value] == 1)
215  numContainingStrings[value]++;
216 
217  linesSizes[line]++;
218  }
219 
221  const std::vector<std::unordered_map<size_t, size_t>>&
222  TokensFrequences() const { return tokensFrequences; }
224  std::vector<std::unordered_map<size_t, size_t>>& TokensFrequences()
225  {
226  return tokensFrequences;
227  }
228 
230  const std::unordered_map<size_t, size_t>& NumContainingStrings() const
231  {
232  return numContainingStrings;
233  }
234 
236  std::unordered_map<size_t, size_t>& NumContainingStrings()
237  {
238  return numContainingStrings;
239  }
240 
242  const std::vector<size_t>& LinesSizes() const { return linesSizes; }
244  std::vector<size_t>& LinesSizes() { return linesSizes; }
245 
247  TfTypes TfType() const { return tfType; }
249  TfTypes& TfType() { return tfType; }
250 
252  bool SmoothIdf() const { return smoothIdf; }
254  bool& SmoothIdf() { return smoothIdf; }
255 
259  template<typename Archive>
260  void serialize(Archive& ar, const unsigned int /* version */)
261  {
262  ar & BOOST_SERIALIZATION_NVP(tfType);
263  ar & BOOST_SERIALIZATION_NVP(smoothIdf);
264  }
265 
266  private:
/**
 * Compute the term-frequency factor of a token; the formula is selected by
 * the tfType member.
 *
 * @tparam ValueType Type of the returned value.
 * @param numOccurrences Number of occurrences of the token (Encode() passes
 *     the count recorded for the current line).
 * @param numTokens Total number of tokens (Encode() passes the recorded size
 *     of the current line).
 * @return The term-frequency value converted to ValueType.
 */
276  template<typename ValueType>
277  ValueType TermFrequency(const size_t numOccurrences,
278  const size_t numTokens)
279  {
280  switch (tfType)
281  {
282  case TfTypes::BINARY:
// 1 when the token occurs at all, 0 otherwise.
283  return numOccurrences > 0;
284  case TfTypes::RAW_COUNT:
// The plain occurrence count.
285  return numOccurrences;
// NOTE(review): as shown, the two returns below follow a return and have no
// case label of their own, so they cannot be reached. The listing numbering
// skips 286 and 288, so their case labels were presumably lost when this
// listing was produced -- TODO confirm against the original header.
// First: occurrences divided by numTokens. Second: log(occurrences) + 1.
287  return static_cast<ValueType>(numOccurrences) / numTokens;
289  return std::log(static_cast<ValueType>(numOccurrences)) + 1;
290  default:
// Unrecognized tfType: report it through Log::Fatal.
291  Log::Fatal << "Incorrect term frequency type!";
292  return 0;
293  }
294  }
295 
305  template<typename ValueType>
306  ValueType InverseDocumentFrequency(const size_t totalNumLines,
307  const size_t numOccurrences)
308  {
309  if (smoothIdf)
310  {
311  return std::log(static_cast<ValueType>(totalNumLines + 1) /
312  (1 + numOccurrences)) + 1.0;
313  }
314  else
315  {
316  return std::log(static_cast<ValueType>(totalNumLines) /
317  numOccurrences) + 1.0;
318  }
319  }
320 
321  private:
323  std::vector<std::unordered_map<size_t, size_t>> tokensFrequences;
328  std::unordered_map<size_t, size_t> numContainingStrings;
330  std::vector<size_t> linesSizes;
332  TfTypes tfType;
334  bool smoothIdf;
335 };
336 
343 template<typename TokenType>
346 } // namespace data
347 } // namespace mlpack
348 
349 #endif
std::unordered_map< size_t, size_t > & NumContainingStrings()
Modify the number of containing strings depending on the given token.
TfTypes TfType() const
Return the term frequency type.
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: add_to_po.hpp:21
void Reset()
Clear the necessary internal variables.
static void InitMatrix(MatType &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
static void InitMatrix(std::vector< std::vector< ElemType >> &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
std::vector< size_t > & LinesSizes()
Modify the lines sizes.
The core includes that mlpack expects; standard C++ includes and Armadillo.
void Encode(MatType &output, const size_t value, const size_t line, const size_t)
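The function performs the TfIdf encoding algorithm, i.e. it writes the product of the term frequency and the inverse document frequency of the given token into the output matrix.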
The class translates a set of strings into numbers using various encoding algorithms.
TfTypes
Enum class used to identify the type of the term frequency statistics.
void serialize(Archive &ar, const unsigned int)
Serialize the class to the given archive.
This class provides a dictionary interface for the purpose of string encoding.
static MLPACK_EXPORT util::PrefixedOutStream Fatal
Prints fatal messages prefixed with [FATAL], then terminates the program.
Definition: log.hpp:90
const std::unordered_map< size_t, size_t > & NumContainingStrings() const
Get the number of containing strings depending on the given token.
TfIdfEncodingPolicy(const TfTypes tfType=TfTypes::RAW_COUNT, const bool smoothIdf=true)
Construct this using the term frequency type and the inverse document frequency type.
TfTypes & TfType()
Modify the term frequency type.
void PreprocessToken(const size_t line, const size_t, const size_t value)
const std::vector< size_t > & LinesSizes() const
Return the lines sizes.
void Encode(std::vector< std::vector< ElemType >> &output, const size_t value, const size_t line, const size_t)
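The function performs the TfIdf encoding algorithm, i.e. it writes the product of the term frequency and the inverse document frequency of the given token into the output vectors.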
Definition of the TfIdfEncodingPolicy class.
std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences()
Modify token frequencies.
bool & SmoothIdf()
Modify the idf algorithm type (whether it's smooth or not).
const std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences() const
Return token frequencies.
bool SmoothIdf() const
Determine the idf algorithm type (whether it's smooth or not).