tf_idf_encoding_policy.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP
14 #define MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP
15 
16 #include <mlpack/prereqs.hpp>
19 
20 namespace mlpack {
21 namespace data {
22 
36 {
37  public:
/**
 * Enum class used to identify the type of the term frequency statistics;
 * the selected value is switched on in TermFrequency().
 *
 * NOTE(review): this listing jumps from line 56 straight to line 59, so
 * further enumerators may be missing here -- confirm against the upstream
 * header before relying on this list being complete.
 */
53  enum class TfTypes
54  {
55  BINARY,
56  RAW_COUNT,
59  };
60 
76  const bool smoothIdf = true) :
77  tfType(tfType),
78  smoothIdf(smoothIdf)
79  { }
80 
84  void Reset()
85  {
86  tokensFrequences.clear();
87  numContainingStrings.clear();
88  linesSizes.clear();
89  }
90 
/**
 * The function initializes the output matrix.  The matrix is zeroed with one
 * row per dictionary entry and one column per dataset line.
 *
 * @tparam MatType Type of the output matrix; must provide
 *     zeros(rows, cols).
 * @param output Output matrix to initialize.
 * @param datasetSize Number of lines in the dataset (number of columns).
 * @param maxNumTokens Unused by this policy.
 * @param dictionarySize Number of tokens in the dictionary (number of rows).
 */
template<typename MatType>
static void InitMatrix(MatType& output,
                       const size_t datasetSize,
                       const size_t /* maxNumTokens */,
                       const size_t dictionarySize)
{
  // Rows index tokens, columns index lines.
  output.zeros(dictionarySize, datasetSize);
}
111 
/**
 * The function initializes the output matrix.  This overload stores the
 * result as a vector of rows: one row per dataset line, each holding one
 * value-initialized element per dictionary entry.
 *
 * Any previous content of the output is discarded, so that a reused
 * container ends up in the same state as a fresh one (matching the zeroing
 * done by the matrix overload).
 *
 * @tparam ElemType Type of the stored elements.
 * @param output Output container to initialize.
 * @param datasetSize Number of lines in the dataset (number of rows).
 * @param maxNumTokens Unused by this policy.
 * @param dictionarySize Number of tokens in the dictionary (row length).
 */
template<typename ElemType>
static void InitMatrix(std::vector<std::vector<ElemType>>& output,
                       const size_t datasetSize,
                       const size_t /* maxNumTokens */,
                       const size_t dictionarySize)
{
  // assign() replaces every row; resize() would only initialize rows that
  // did not exist before and would keep stale values (and stale row lengths)
  // in the existing ones.
  output.assign(datasetSize, std::vector<ElemType>(dictionarySize));
}
134 
147  template<typename MatType>
148  void Encode(MatType& output,
149  const size_t value,
150  const size_t line,
151  const size_t /* index */)
152  {
153  const typename MatType::elem_type tf =
154  TermFrequency<typename MatType::elem_type>(
155  tokensFrequences[line][value], linesSizes[line]);
156 
157  const typename MatType::elem_type idf =
158  InverseDocumentFrequency<typename MatType::elem_type>(
159  output.n_cols, numContainingStrings[value]);
160 
161  output(value - 1, line) = tf * idf;
162  }
163 
179  template<typename ElemType>
180  void Encode(std::vector<std::vector<ElemType>>& output,
181  const size_t value,
182  const size_t line,
183  const size_t /* index */)
184  {
185  const ElemType tf = TermFrequency<ElemType>(
186  tokensFrequences[line][value], linesSizes[line]);
187 
188  const ElemType idf = InverseDocumentFrequency<ElemType>(
189  output.size(), numContainingStrings[value]);
190 
191  output[line][value - 1] = tf * idf;
192  }
193 
194  /*
195  * The function calculates the necessary statistics for the purpose
196  * of the tf-idf algorithm during the first pass through the dataset.
197  *
198  * @param line The line number at which the encoding is performed.
199  * @param index The token sequence number in the line.
200  * @param value The encoded token.
201  */
202  void PreprocessToken(const size_t line,
203  const size_t /* index */,
204  const size_t value)
205  {
206  if (line >= tokensFrequences.size())
207  {
208  linesSizes.resize(line + 1);
209  tokensFrequences.resize(line + 1);
210  }
211 
212  tokensFrequences[line][value]++;
213 
214  if (tokensFrequences[line][value] == 1)
215  numContainingStrings[value]++;
216 
217  linesSizes[line]++;
218  }
219 
221  const std::vector<std::unordered_map<size_t, size_t>>&
222  TokensFrequences() const { return tokensFrequences; }
224  std::vector<std::unordered_map<size_t, size_t>>& TokensFrequences()
225  {
226  return tokensFrequences;
227  }
228 
230  const std::unordered_map<size_t, size_t>& NumContainingStrings() const
231  {
232  return numContainingStrings;
233  }
234 
236  std::unordered_map<size_t, size_t>& NumContainingStrings()
237  {
238  return numContainingStrings;
239  }
240 
242  const std::vector<size_t>& LinesSizes() const { return linesSizes; }
244  std::vector<size_t>& LinesSizes() { return linesSizes; }
245 
247  TfTypes TfType() const { return tfType; }
249  TfTypes& TfType() { return tfType; }
250 
252  bool SmoothIdf() const { return smoothIdf; }
254  bool& SmoothIdf() { return smoothIdf; }
255 
259  template<typename Archive>
260  void serialize(Archive& ar, const uint32_t /* version */)
261  {
262  ar(CEREAL_NVP(tfType));
263  ar(CEREAL_NVP(smoothIdf));
264  }
265 
266  private:
/**
 * Compute the term frequency statistic selected by tfType.
 *
 * Encode() calls this with the number of occurrences of a token in a line
 * and the total number of tokens counted for that line.
 *
 * @tparam ValueType Type of the returned value.
 * @param numOccurrences The number of the given token occurrences in the
 *     line.
 * @param numTokens The total number of tokens in the line.
 * @return For BINARY, whether the token occurs at all; for RAW_COUNT, the
 *     number of occurrences.  An unknown type is reported through
 *     Log::Fatal.
 */
276  template<typename ValueType>
277  ValueType TermFrequency(const size_t numOccurrences,
278  const size_t numTokens)
279  {
280  switch (tfType)
281  {
282  case TfTypes::BINARY:
283  return numOccurrences > 0;
284  case TfTypes::RAW_COUNT:
285  return numOccurrences;
// NOTE(review): as listed, the next two returns are unreachable because they
// directly follow the return above.  The listing skips line numbers 286 and
// 288, so presumably a case label precedes each of them (occurrences divided
// by line size, and log(occurrences) + 1) -- confirm against the upstream
// header.
287  return static_cast<ValueType>(numOccurrences) / numTokens;
289  return std::log(static_cast<ValueType>(numOccurrences)) + 1;
290  default:
291  Log::Fatal << "Incorrect term frequency type!";
// Return statement kept after the fatal log so every path yields a value.
292  return 0;
293  }
294  }
295 
305  template<typename ValueType>
306  ValueType InverseDocumentFrequency(const size_t totalNumLines,
307  const size_t numOccurrences)
308  {
309  if (smoothIdf)
310  {
311  return std::log(static_cast<ValueType>(totalNumLines + 1) /
312  (1 + numOccurrences)) + 1.0;
313  }
314  else
315  {
316  return std::log(static_cast<ValueType>(totalNumLines) /
317  numOccurrences) + 1.0;
318  }
319  }
320 
321  private:
//! For each line, the number of occurrences of every token seen in that
//! line (filled by PreprocessToken()).
323  std::vector<std::unordered_map<size_t, size_t>> tokensFrequences;
//! For each token, the number of lines in which it occurs at least once
//! (filled by PreprocessToken()).
328  std::unordered_map<size_t, size_t> numContainingStrings;
//! For each line, the number of tokens counted in it (filled by
//! PreprocessToken()).
330  std::vector<size_t> linesSizes;
//! The term frequency type used by TermFrequency().
332  TfTypes tfType;
//! Whether InverseDocumentFrequency() uses the smoothed formula.
334  bool smoothIdf;
335 };
336 
343 template<typename TokenType>
346 } // namespace data
347 } // namespace mlpack
348 
349 #endif
std::unordered_map< size_t, size_t > & NumContainingStrings()
Modify the number of containing strings depending on the given token.
TfTypes TfType() const
Return the term frequency type.
Linear algebra utility functions, generally performed on matrices or vectors.
void Reset()
Clear the necessary internal variables.
static void InitMatrix(MatType &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
static void InitMatrix(std::vector< std::vector< ElemType >> &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
std::vector< size_t > & LinesSizes()
Modify the lines sizes.
The core includes that mlpack expects; standard C++ includes, Armadillo, cereal, and a few basic mlpa...
void Encode(MatType &output, const size_t value, const size_t line, const size_t)
The function performs the TfIdf encoding algorithm i.e.
The class translates a set of strings into numbers using various encoding algorithms.
TfTypes
Enum class used to identify the type of the term frequency statistics.
This class provides a dictionary interface for the purpose of string encoding.
const std::unordered_map< size_t, size_t > & NumContainingStrings() const
Get the number of containing strings depending on the given token.
TfIdfEncodingPolicy(const TfTypes tfType=TfTypes::RAW_COUNT, const bool smoothIdf=true)
Construct this using the term frequency type and the inverse document frequency type.
TfTypes & TfType()
Modify the term frequency type.
constexpr auto data(Container const &container) noexcept -> decltype(container.data())
Definition: iterator.hpp:79
void PreprocessToken(const size_t line, const size_t, const size_t value)
const std::vector< size_t > & LinesSizes() const
Return the lines sizes.
void Encode(std::vector< std::vector< ElemType >> &output, const size_t value, const size_t line, const size_t)
The function performs the TfIdf encoding algorithm i.e.
Definition of the TfIdfEncodingPolicy class.
std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences()
Modify token frequencies.
bool & SmoothIdf()
Modify the idf algorithm type (whether it's smooth or not).
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
static util::PrefixedOutStream Fatal
Definition: log.hpp:105
const std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences() const
Return token frequencies.
bool SmoothIdf() const
Determine the idf algorithm type (whether it's smooth or not).