Skip to content

Commit eb0f310

Browse files
authored
Merge pull request #1 from koheiw/remove-map
Remove map
2 parents 9d555aa + a56d642 commit eb0f310

File tree

3 files changed

+83
-52
lines changed

3 files changed

+83
-52
lines changed

src/word2vec/word2vec.cpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,24 @@ namespace w2v {
1818
std::shared_ptr<corpus_t> corpus(new corpus_t(_corpus));
1919

2020
m_vectorSize = _settings.size;
21-
m_mapSize = corpus->types.size();
21+
m_vocaburarySize = corpus->types.size();
2222

2323
// train model
24-
std::vector<float> _trainMatrix;
24+
//std::vector<float> _trainMatrix;
2525
trainer_t(std::make_shared<settings_t>(_settings),
2626
corpus,
27-
_trainProgressCallback)(_trainMatrix);
27+
_trainProgressCallback)(m_trainMatrix);
2828

29-
std::size_t wordIndex = 0;
30-
for (auto const &type : corpus->types) {
31-
//Rcpp::Rcout << type << "\n";
32-
auto &vec = m_map[type];
33-
vec.resize(m_vectorSize);
34-
std::copy(&_trainMatrix[wordIndex * m_vectorSize],
35-
&_trainMatrix[(wordIndex + 1) * m_vectorSize],
36-
&vec[0]);
37-
wordIndex++;
38-
}
29+
// std::size_t wordIndex = 0;
30+
// for (auto const &type : corpus->types) {
31+
// //Rcpp::Rcout << type << "\n";
32+
// auto &vec = m_map[type];
33+
// vec.resize(m_vectorSize);
34+
// std::copy(&m_trainMatrix[wordIndex * m_vectorSize],
35+
// &m_trainMatrix[(wordIndex + 1) * m_vectorSize],
36+
// &vec[0]);
37+
// wordIndex++;
38+
// }
3939

4040
return true;
4141
} catch (const std::exception &_e) {

src/word2vec/word2vec.hpp

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,14 @@ namespace w2v {
9595

9696
class word2vec_t final {
9797
protected:
98-
using map_t = std::unordered_map<std::string, std::vector<float>>;
98+
//using map_t = std::unordered_map<std::string, std::vector<float>>;
9999

100-
map_t m_map;
100+
// word vector
101+
std::vector<float> m_trainMatrix; // NOTE: rename to m_layer or m_wordvector?
102+
//map_t m_map;
101103
uint16_t m_vectorSize = 0;
102-
std::size_t m_mapSize = 0;
104+
std::size_t m_vocaburarySize = 0;
105+
//std::size_t m_mapSize = 0;
103106
mutable std::string m_errMsg;
104107

105108
public:
@@ -110,20 +113,22 @@ namespace w2v {
110113
/// type of callback function to be called on training progress events
111114
using trainProgressCallback_t = std::function<void(float, float)>;
112115

113-
public:
114-
115116
/// constructs a model
116-
word2vec_t(): m_map(), m_errMsg() {}
117+
//word2vec_t(): m_map(), m_errMsg() {}
117118
/// virtual destructor
118119
virtual ~word2vec_t() = default;
119120

120121
/// direct access to the word-vector map
121-
const map_t &map() {return m_map;} // NOTE: consider removing
122+
//const map_t &map() {return m_map;} // NOTE: consider removing
123+
124+
const std::vector<float> &trainMatrix() {return m_trainMatrix;}
122125

123126
/// @returns vector size of model
124127
uint16_t vectorSize() const noexcept {return m_vectorSize;}
125128
/// @returns model size (number of stored vectors)
126-
std::size_t modelSize() const noexcept {return m_mapSize;}
129+
//std::size_t modelSize() const noexcept {return m_mapSize;}
130+
/// @returns m_vocaburarySize size (number of types)
131+
std::size_t vocaburarySize() const noexcept {return m_vocaburarySize;}
127132
/// @returns error message
128133
std::string errMsg() const noexcept {return m_errMsg;}
129134

@@ -133,19 +138,35 @@ namespace w2v {
133138
trainProgressCallback_t _trainProgressCallback) noexcept;
134139

135140
/// normalise vectors
141+
// inline void normalize() {
142+
// for(auto &it : m_map) {
143+
// // normalize vector
144+
// auto &vec = it.second;
145+
// float ss = 0.0f;
146+
// for (auto const &v : vec) {
147+
// ss += v * v;
148+
// }
149+
// if (ss <= 0.0f)
150+
// throw std::runtime_error("failed to normalize vectors");
151+
// float d = std::sqrt(ss / vec.size());
152+
// for (auto &v : vec) {
153+
// v = v / d;
154+
// }
155+
// }
156+
// }
157+
158+
// normalise vectors
136159
inline void normalize() {
137-
for(auto &it : m_map) {
138-
// normalize vector
139-
auto &vec = it.second;
160+
for(std::size_t i = 0; i < m_vocaburarySize; i += m_vectorSize) {
140161
float ss = 0.0f;
141-
for (auto const &v : vec) {
142-
ss += v * v;
162+
for(std::size_t j = 0; j < m_vectorSize; ++j) {
163+
ss += m_trainMatrix[i + j] * m_trainMatrix[i + j];
143164
}
144165
if (ss <= 0.0f)
145166
throw std::runtime_error("failed to normalize vectors");
146-
float d = std::sqrt(ss / vec.size());
147-
for (auto &v : vec) {
148-
v = v / d;
167+
float d = std::sqrt(ss / m_vectorSize);
168+
for(std::size_t j = 0; j < m_vectorSize; ++j) {
169+
m_trainMatrix[i + j] = m_trainMatrix[i + j] / d;
149170
}
150171
}
151172
}

src/wordvector.cpp

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
//#include <iomanip>
44
#include <chrono>
55
#include <thread>
6-
#include <unordered_map>
6+
//#include <unordered_map>
77
#include <mutex>
88
#include "word2vec/word2vec.hpp"
99
#include "tokens.h"
@@ -19,29 +19,38 @@ Rcpp::CharacterVector encode(std::vector<std::string> types){
1919
return(types_);
2020
}
2121

22-
Rcpp::NumericMatrix as_matrix(w2v::word2vec_t model) {
23-
24-
std::unordered_map<std::string, std::vector<float>> m_map = model.map();
25-
std::vector<std::string> words;
26-
words.reserve(m_map.size());
27-
for(auto it : m_map) {
28-
words.push_back(it.first);
29-
}
22+
// Rcpp::NumericMatrix as_matrix(w2v::word2vec_t model) {
23+
//
24+
// std::unordered_map<std::string, std::vector<float>> m_map = model.map();
25+
// std::vector<std::string> words;
26+
// words.reserve(m_map.size());
27+
// for(auto it : m_map) {
28+
// words.push_back(it.first);
29+
// }
30+
//
31+
// std::vector<float> mat;
32+
// mat.reserve(model.vectorSize() * words.size());
33+
// for (size_t j = 0; j < words.size(); j++) {
34+
// //auto p = model.vector(words[j]);
35+
// auto it = m_map.find(words[j]);
36+
// if (it != m_map.end()) {
37+
// //std::vector<float> vec = *p;
38+
// std::vector<float> vec = it->second;
39+
// mat.insert(mat.end(), vec.begin(), vec.end());
40+
// }
41+
// }
42+
// //std::vector<float> mat = model.trainMatrix();
43+
//
44+
// Rcpp::NumericMatrix mat_(model.vectorSize(), words.size(), mat.begin());
45+
// colnames(mat_) = encode(words);
46+
// return Rcpp::transpose(mat_);
47+
// }
3048

31-
std::vector<float> mat;
32-
mat.reserve(model.vectorSize() * words.size());
33-
for (size_t j = 0; j < words.size(); j++) {
34-
//auto p = model.vector(words[j]);
35-
auto it = m_map.find(words[j]);
36-
if (it != m_map.end()) {
37-
//std::vector<float> vec = *p;
38-
std::vector<float> vec = it->second;
39-
mat.insert(mat.end(), vec.begin(), vec.end());
40-
}
41-
}
49+
Rcpp::NumericMatrix as_matrix(w2v::word2vec_t model, w2v::corpus_t corpus) {
4250

43-
Rcpp::NumericMatrix mat_(model.vectorSize(), words.size(), mat.begin());
44-
colnames(mat_) = encode(words);
51+
std::vector<float> mat = model.trainMatrix();
52+
Rcpp::NumericMatrix mat_(model.vectorSize(), corpus.types.size(), mat.begin());
53+
colnames(mat_) = encode(corpus.types);
4554
return Rcpp::transpose(mat_);
4655
}
4756

@@ -155,7 +164,8 @@ Rcpp::List cpp_w2v(Rcpp::List texts_,
155164
Rprintf(" ...complete\n");
156165

157166
Rcpp::List out = Rcpp::List::create(
158-
Rcpp::Named("model") = as_matrix(word2vec),
167+
//Rcpp::Named("model") = as_matrix(word2vec),
168+
Rcpp::Named("model") = as_matrix(word2vec, corpus),
159169
//Rcpp::Named("model") = model,
160170
//Rcpp::Named("vocabulary") = types.size(),
161171
//Rcpp::Named("success") = success,

0 commit comments

Comments
 (0)