From 4a81483d1c57729c8bc35845608cedd1a22f6df2 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 6 Dec 2025 12:16:33 +0900 Subject: [PATCH] Do not return document vector from word2vec --- src/wordvector.cpp | 22 ++++++----- tests/testthat/test-word2vec.R | 69 ++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/src/wordvector.cpp b/src/wordvector.cpp index 53f3af4..646d486 100644 --- a/src/wordvector.cpp +++ b/src/wordvector.cpp @@ -192,16 +192,18 @@ Rcpp::List cpp_word2vec(TokensPtr xptr, Rprintf(" ...complete\n"); Rcpp::List values; - if (type == 3) { // dm - values = Rcpp::List::create( - Rcpp::Named("word") = get_words(word2vec), - Rcpp::Named("doc") = get_documents(word2vec) - ); - } else if (type == 4) { // dbow - values = Rcpp::List::create( - Rcpp::Named("doc") = get_documents(word2vec) - ); - } else { // cbow or dbow + if (doc2vec) { + if (type == 4) { // dbow + values = Rcpp::List::create( + Rcpp::Named("doc") = get_documents(word2vec) + ); + } else { // dm + values = Rcpp::List::create( + Rcpp::Named("word") = get_words(word2vec), + Rcpp::Named("doc") = get_documents(word2vec) + ); + } + } else { // cbow, sg, dm values = Rcpp::List::create( Rcpp::Named("word") = get_words(word2vec) ); diff --git a/tests/testthat/test-word2vec.R b/tests/testthat/test-word2vec.R index 97ea3b7..63dde64 100644 --- a/tests/testthat/test-word2vec.R +++ b/tests/testthat/test-word2vec.R @@ -145,6 +145,75 @@ test_that("textmodel_word2vec works", { "textmodel_word2vec does not have the layer for documents" ) + # DM + expect_output( + wov3 <- textmodel_word2vec(toks, dim = 50, iter = 10, min_count = 2, sample = 1, + type = "dm", verbose = TRUE), + "Training distributed memory model with 50 dimensions" + ) + expect_equal( + class(wov3), + c("textmodel_word2vec", "textmodel_wordvector") + ) + expect_true( + wov3$use_ns + ) + expect_identical( + wov3$ns_size, 5L + ) + expect_identical( + wov3$window, 5L + ) + expect_identical( + dim(wov3$values$word), c(5360L, 50L) + ) + expect_null( + wov3$values$doc + ) + expect_identical( + dim(wov3$weights), c(5360L, 50L) + ) + expect_identical( + wov3$sample, 1.0 + ) + expect_equal( + wov3$min_count, 2L + ) + expect_false( + wov3$normalize + ) + expect_identical( + featfreq(dfm_trim(dfm(toks), 2)), + wov3$frequency + ) + expect_true( + wov3$tolower + ) + + expect_output( + print(wov3), + paste( + "", + "Call:", + "textmodel_word2vec(x = toks, dim = 50, type = \"dm\", min_count = 2, ", + " iter = 10, sample = 1, verbose = TRUE)", + "", + "50 dimensions; 5,360 words.", sep = "\n"), fixed = TRUE + ) + expect_equal( + class(expect_output(print(wov3))), + class(wov3) + ) + + expect_equal( + rownames(probability(wov3, c("good", "bad"), layer = "words", mode = "numeric")), + rownames(wov3$values$word) + ) + + expect_error( + probability(wov3, c("good", "bad"), layer = "documents", mode = "numeric"), + "textmodel_word2vec does not have the layer for documents" + ) })