diff --git a/src/dev.h b/src/dev.h
new file mode 100644
index 0000000..a157150
--- /dev/null
+++ b/src/dev.h
@@ -0,0 +1,44 @@
+#ifndef __DEV__
+#define __DEV__
+
+#include <Rcpp.h>
+#include <chrono>
+#include <iomanip>
+#include <map>
+#include <string>
+
+// Kept so that includers relying on unqualified Rcpp names still compile.
+using namespace Rcpp;
+
+namespace dev{
+
+  /* ---- Profiling tools ------------------------- */
+
+  // Maps a timer label to the time point at which that timer was started.
+  typedef std::map<std::string, std::chrono::high_resolution_clock::time_point> Timer;
+
+  // Store the current time under `label`, replacing any earlier start time
+  // recorded for the same label.
+  inline void start_timer(const std::string &label, Timer &timer){
+    timer[label] = std::chrono::high_resolution_clock::now();
+  }
+
+  // Print to Rcout the milliseconds elapsed since start_timer() stored
+  // `label`, or "is not timed" when the label is absent from `timer`.
+  // The entry is not erased, so a label can be reported more than once.
+  inline void stop_timer(const std::string &label, Timer &timer){
+    // Read the clock before writing anything so output time is not measured.
+    const auto now = std::chrono::high_resolution_clock::now();
+    const auto it = timer.find(label);
+    Rcpp::Rcout << std::left << std::setw(20) << "'" + label + "'";
+    if (it == timer.end()){
+      Rcpp::Rcout << " is not timed\n";
+    }else{
+      Rcpp::Rcout << " ";
+      Rcpp::Rcout << std::chrono::duration<double, std::milli>(now - it->second).count();
+      Rcpp::Rcout << " millisec\n";
+    }
+  }
+}
+
+#endif
diff --git a/src/wordvector.cpp b/src/wordvector.cpp
index c7da912..53f3af4 100644
--- a/src/wordvector.cpp
+++ b/src/wordvector.cpp
@@ -4,6 +4,7 @@
 #include
 #include "word2vec/word2vec.hpp"
 #include "tokens.h"
+#include "dev.h"
 
 typedef XPtr TokensPtr;
 typedef std::vector vocabulary_t;
@@ -20,32 +21,57 @@ Rcpp::CharacterVector encode(std::vector types){
     return types_;
 }
 
+// Copy a row-major buffer of nrow * ncol values into an nrow x ncol matrix.
+// Returns an empty matrix when the buffer is empty and throws
+// std::runtime_error when its length is not nrow * ncol. Templated on the
+// element type and taking a const reference, so the caller's vector is not copied.
+template <typename T>
+Rcpp::NumericMatrix as_matrix(const std::vector<T> &mat,
+                              std::size_t nrow, std::size_t ncol) {
+    if (mat.empty())
+        return Rcpp::NumericMatrix();
+    if (nrow * ncol != mat.size())
+        throw std::runtime_error("Invalid matrix size");
+    Rcpp::NumericMatrix mat_(nrow, ncol);
+    for (std::size_t i = 0; i < nrow; ++i) {
+        for (std::size_t j = 0; j < ncol; ++j) {
+            mat_(i, j) = mat[i * ncol + j];
+        }
+    }
+    return mat_;
+}
+
+// Build the weight matrix: vocabularySize() rows by vectorSize() columns,
+// with row names taken from encode(model.vocabulary()).
+Rcpp::NumericMatrix get_weights(w2v::word2vec_t model) {
+    const auto &mat = model.weights();
+    if (model.vectorSize() * model.vocabularySize() != mat.size())
+        throw std::runtime_error("Invalid weight matrix");
+    Rcpp::NumericMatrix mat_ = as_matrix(mat, model.vocabularySize(), model.vectorSize());
+    rownames(mat_) = encode(model.vocabulary());
+    return mat_;
+}
+
 Rcpp::NumericMatrix get_words(w2v::word2vec_t model) {
     std::vector mat = model.values();
     if (model.vectorSize() * model.vocabularySize() != mat.size())
         throw std::runtime_error("Invalid word matrix");
-    Rcpp::NumericMatrix mat_(model.vectorSize(), model.vocabularySize(), mat.begin());
-    colnames(mat_) = encode(model.vocabulary());
-    return Rcpp::transpose(mat_);
+    Rcpp::NumericMatrix mat_ = as_matrix(mat, model.vocabularySize(), model.vectorSize());
+    rownames(mat_) = encode(model.vocabulary());
+    return mat_;
 }
 
 Rcpp::NumericMatrix get_documents(w2v::word2vec_t model) {
     std::vector mat = model.docValues();
+    // Return early on an empty buffer: the size check below would otherwise
+    // throw whenever vectorSize() * corpusSize() is non-zero.
     if (mat.size() == 0)
         return Rcpp::NumericMatrix();
     if (model.vectorSize() * model.corpusSize() != mat.size())
         throw std::runtime_error("Invalid document matrix");
-    Rcpp::NumericMatrix mat_(model.vectorSize(), model.corpusSize(), mat.begin());
-    return Rcpp::transpose(mat_);
-}
-
-Rcpp::NumericMatrix get_weights(w2v::word2vec_t model) {
-    std::vector mat = model.weights();
-    if (model.vectorSize() * model.vocabularySize() != mat.size())
-        throw std::runtime_error("Invalid weight matrix");
-    Rcpp::NumericMatrix mat_(model.vectorSize(), model.vocabularySize(), mat.begin());
-    colnames(mat_) = encode(model.vocabulary());
-    return Rcpp::transpose(mat_);
+    Rcpp::NumericMatrix mat_ = as_matrix(mat, model.corpusSize(), model.vectorSize());
+    // TODO: add document names here
+    return mat_;
 }
 
 Rcpp::NumericVector get_frequency(w2v::corpus_t corpus) {