Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions src/word2vec/word2vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,24 @@
std::shared_ptr<corpus_t> corpus(new corpus_t(_corpus));

m_vectorSize = _settings.size;
m_mapSize = corpus->types.size();
m_vocaburarySize = corpus->types.size();

Check warning on line 21 in src/word2vec/word2vec.cpp

View check run for this annotation

Codecov / codecov/patch

src/word2vec/word2vec.cpp#L21

Added line #L21 was not covered by tests

// train model
std::vector<float> _trainMatrix;
//std::vector<float> _trainMatrix;
trainer_t(std::make_shared<settings_t>(_settings),
corpus,
_trainProgressCallback)(_trainMatrix);
_trainProgressCallback)(m_trainMatrix);

Check warning on line 27 in src/word2vec/word2vec.cpp

View check run for this annotation

Codecov / codecov/patch

src/word2vec/word2vec.cpp#L27

Added line #L27 was not covered by tests

std::size_t wordIndex = 0;
for (auto const &type : corpus->types) {
//Rcpp::Rcout << type << "\n";
auto &vec = m_map[type];
vec.resize(m_vectorSize);
std::copy(&_trainMatrix[wordIndex * m_vectorSize],
&_trainMatrix[(wordIndex + 1) * m_vectorSize],
&vec[0]);
wordIndex++;
}
// std::size_t wordIndex = 0;
// for (auto const &type : corpus->types) {
// //Rcpp::Rcout << type << "\n";
// auto &vec = m_map[type];
// vec.resize(m_vectorSize);
// std::copy(&m_trainMatrix[wordIndex * m_vectorSize],
// &m_trainMatrix[(wordIndex + 1) * m_vectorSize],
// &vec[0]);
// wordIndex++;
// }

return true;
} catch (const std::exception &_e) {
Expand Down
53 changes: 37 additions & 16 deletions src/word2vec/word2vec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,14 @@

class word2vec_t final {
protected:
using map_t = std::unordered_map<std::string, std::vector<float>>;
//using map_t = std::unordered_map<std::string, std::vector<float>>;

map_t m_map;
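// word vectors stored as one flat array: the vector for type k occupies
// elements [k * m_vectorSize, (k + 1) * m_vectorSize)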
std::vector<float> m_trainMatrix; // NOTE: rename to m_layer or m_wordvector?
//map_t m_map;
uint16_t m_vectorSize = 0;
std::size_t m_mapSize = 0;
std::size_t m_vocaburarySize = 0;
//std::size_t m_mapSize = 0;
mutable std::string m_errMsg;

public:
Expand All @@ -110,20 +113,22 @@
/// type of callback function to be called on training progress events
using trainProgressCallback_t = std::function<void(float, float)>;

public:

/// constructs a model
word2vec_t(): m_map(), m_errMsg() {}
//word2vec_t(): m_map(), m_errMsg() {}
/// virtual destructor
virtual ~word2vec_t() = default;

/// direct access to the word-vector map
const map_t &map() {return m_map;} // NOTE: consider removing
//const map_t &map() {return m_map;} // NOTE: consider removing

const std::vector<float> &trainMatrix() {return m_trainMatrix;}

Check warning on line 124 in src/word2vec/word2vec.hpp

View check run for this annotation

Codecov / codecov/patch

src/word2vec/word2vec.hpp#L124

Added line #L124 was not covered by tests

/// @returns vector size of model
uint16_t vectorSize() const noexcept {return m_vectorSize;}
/// @returns model size (number of stored vectors)
std::size_t modelSize() const noexcept {return m_mapSize;}
//std::size_t modelSize() const noexcept {return m_mapSize;}
/// @returns vocabulary size (number of types)
std::size_t vocaburarySize() const noexcept {return m_vocaburarySize;}
/// @returns error message
std::string errMsg() const noexcept {return m_errMsg;}

Expand All @@ -133,19 +138,35 @@
trainProgressCallback_t _trainProgressCallback) noexcept;

/// normalise vectors
// inline void normalize() {
// for(auto &it : m_map) {
// // normalize vector
// auto &vec = it.second;
// float ss = 0.0f;
// for (auto const &v : vec) {
// ss += v * v;
// }
// if (ss <= 0.0f)
// throw std::runtime_error("failed to normalize vectors");
// float d = std::sqrt(ss / vec.size());
// for (auto &v : vec) {
// v = v / d;
// }
// }
// }

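// normalise vectors: each m_vectorSize-long slice of m_trainMatrix is divided
// by its root mean square, sqrt(ss / m_vectorSize); throws if a slice has ss <= 0
// NOTE(review): the outer loop is bounded by m_vocaburarySize while i advances by
// m_vectorSize, so it appears to stop before the end of m_trainMatrix -- confirm
// whether the bound should be m_trainMatrix.size()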
inline void normalize() {
for(auto &it : m_map) {
// normalize vector
auto &vec = it.second;
for(std::size_t i = 0; i < m_vocaburarySize; i += m_vectorSize) {

Check warning on line 160 in src/word2vec/word2vec.hpp

View check run for this annotation

Codecov / codecov/patch

src/word2vec/word2vec.hpp#L160

Added line #L160 was not covered by tests
float ss = 0.0f;
for (auto const &v : vec) {
ss += v * v;
for(std::size_t j = 0; j < m_vectorSize; ++j) {
ss += m_trainMatrix[i + j] * m_trainMatrix[i + j];

Check warning on line 163 in src/word2vec/word2vec.hpp

View check run for this annotation

Codecov / codecov/patch

src/word2vec/word2vec.hpp#L162-L163

Added lines #L162 - L163 were not covered by tests
}
if (ss <= 0.0f)
throw std::runtime_error("failed to normalize vectors");
float d = std::sqrt(ss / vec.size());
for (auto &v : vec) {
v = v / d;
float d = std::sqrt(ss / m_vectorSize);
for(std::size_t j = 0; j < m_vectorSize; ++j) {
m_trainMatrix[i + j] = m_trainMatrix[i + j] / d;

Check warning on line 169 in src/word2vec/word2vec.hpp

View check run for this annotation

Codecov / codecov/patch

src/word2vec/word2vec.hpp#L167-L169

Added lines #L167 - L169 were not covered by tests
}
}
}
Expand Down
56 changes: 33 additions & 23 deletions src/wordvector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
//#include <iomanip>
#include <chrono>
#include <thread>
#include <unordered_map>
//#include <unordered_map>
#include <mutex>
#include "word2vec/word2vec.hpp"
#include "tokens.h"
Expand All @@ -19,29 +19,38 @@
return(types_);
}

Rcpp::NumericMatrix as_matrix(w2v::word2vec_t model) {

std::unordered_map<std::string, std::vector<float>> m_map = model.map();
std::vector<std::string> words;
words.reserve(m_map.size());
for(auto it : m_map) {
words.push_back(it.first);
}
// Rcpp::NumericMatrix as_matrix(w2v::word2vec_t model) {
//
// std::unordered_map<std::string, std::vector<float>> m_map = model.map();
// std::vector<std::string> words;
// words.reserve(m_map.size());
// for(auto it : m_map) {
// words.push_back(it.first);
// }
//
// std::vector<float> mat;
// mat.reserve(model.vectorSize() * words.size());
// for (size_t j = 0; j < words.size(); j++) {
// //auto p = model.vector(words[j]);
// auto it = m_map.find(words[j]);
// if (it != m_map.end()) {
// //std::vector<float> vec = *p;
// std::vector<float> vec = it->second;
// mat.insert(mat.end(), vec.begin(), vec.end());
// }
// }
// //std::vector<float> mat = model.trainMatrix();
//
// Rcpp::NumericMatrix mat_(model.vectorSize(), words.size(), mat.begin());
// colnames(mat_) = encode(words);
// return Rcpp::transpose(mat_);
// }

std::vector<float> mat;
mat.reserve(model.vectorSize() * words.size());
for (size_t j = 0; j < words.size(); j++) {
//auto p = model.vector(words[j]);
auto it = m_map.find(words[j]);
if (it != m_map.end()) {
//std::vector<float> vec = *p;
std::vector<float> vec = it->second;
mat.insert(mat.end(), vec.begin(), vec.end());
}
}
Rcpp::NumericMatrix as_matrix(w2v::word2vec_t model, w2v::corpus_t corpus) {

Check warning on line 49 in src/wordvector.cpp

View check run for this annotation

Codecov / codecov/patch

src/wordvector.cpp#L49

Added line #L49 was not covered by tests

Rcpp::NumericMatrix mat_(model.vectorSize(), words.size(), mat.begin());
colnames(mat_) = encode(words);
std::vector<float> mat = model.trainMatrix();
Rcpp::NumericMatrix mat_(model.vectorSize(), corpus.types.size(), mat.begin());
colnames(mat_) = encode(corpus.types);

Check warning on line 53 in src/wordvector.cpp

View check run for this annotation

Codecov / codecov/patch

src/wordvector.cpp#L51-L53

Added lines #L51 - L53 were not covered by tests
return Rcpp::transpose(mat_);
}

Expand Down Expand Up @@ -155,7 +164,8 @@
Rprintf(" ...complete\n");

Rcpp::List out = Rcpp::List::create(
Rcpp::Named("model") = as_matrix(word2vec),
//Rcpp::Named("model") = as_matrix(word2vec),
Rcpp::Named("model") = as_matrix(word2vec, corpus),

Check warning on line 168 in src/wordvector.cpp

View check run for this annotation

Codecov / codecov/patch

src/wordvector.cpp#L168

Added line #L168 was not covered by tests
//Rcpp::Named("model") = model,
//Rcpp::Named("vocabulary") = types.size(),
//Rcpp::Named("success") = success,
Expand Down
Loading