From 413263e0b729a3e3cd7c2736fc9a8f7a808326fd Mon Sep 17 00:00:00 2001 From: Xixuan Zhang <119933243+xixuanzhang2022@users.noreply.github.com> Date: Thu, 30 Jan 2025 19:12:35 +0100 Subject: [PATCH 1/9] add GloVe implementation GloVe implementation based on https://github.com/roamanalytics/mittens/tree/master using systemds --- scripts/builtin/glove.dml | 117 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 scripts/builtin/glove.dml diff --git a/scripts/builtin/glove.dml b/scripts/builtin/glove.dml new file mode 100644 index 00000000000..fb0fbf3f200 --- /dev/null +++ b/scripts/builtin/glove.dml @@ -0,0 +1,117 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# The implementation is based on +# (1) https://github.com/roamanalytics/mittens/blob/master/mittens/ +# (2) https://github.com/stanfordnlp/GloVe/blob/master/src/glove.c +#------------------------------------------------------------- + +source("scripts/builtin/cooccur.dml") as cooc + +init = function(matrix[double] cooc_matrix, double x_max, double alpha) + return(matrix[double] weights, matrix[double] log_cooc_matrix){ + E = 2.718281828; + bounded = pmin(cooc_matrix, x_max); + weights = (bounded / x_max) ^ alpha; + log_cooc_matrix = ifelse(cooc_matrix > 0, log(cooc_matrix, E), 0); +} + +gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_index, int seed, int vector_size, double alpha, double eta, double x_max, int iterations,int print_loss_it) + return (frame[Unknown] G){ + /* + * Computes vector embeddings for words based on their co-occurrence statistics in a large text corpus. + * + * Parameters: + * - cooc_matrix: Precomputed co-occurrence matrix of shape (N, N). + * - cooc_index: Index file mapping words to their positions in the co-occurrence matrix. + * The second column should contain the word list in the same order as the matrix. + * - seed: Random seed for reproducibility. + * - vector_size: Dimensionality of word vectors (V). + * - eta: Learning rate for optimization. + * - alpha: Weighting function parameter. + * - x_max: Maximum co-occurrence value as per the GloVe paper. + * - iterations: Total number of training iterations. + * - print_loss_it: Interval (in iterations) for printing the loss. + * + * Returns: + * - G: A data structure containing word indices and their corresponding word vectors of shape (N, V), + * where each row represents a word vector of shape V. 
+ */ + + # initialize + vocab_size = nrow(cooc_matrix); + W = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed)-0.5)/vector_size; + C = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed)-0.5)/vector_size; + bw = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+1)-0.5)/vector_size; + bc = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+1)-0.5)/vector_size; + [weights, log_cooc_matrix] = init(cooc_matrix, x_max, alpha); + + momentum_W = 0.1 * matrix(1, nrow(W), ncol(W)); + momentum_C = 0.1 * matrix(1, nrow(C), ncol(C)); + momentum_bw = 0.1 * matrix(1, nrow(bw), ncol(bw)); + momentum_bc = 0.1 * matrix(1, nrow(bc), ncol(bc)); + + error = 0; + + for (iter in 1:iterations) { + + # compute predictions for all co-occurring word pairs at once + predictions = W %*% t(C) + bw + t(bc); + diffs = predictions - log_cooc_matrix; + weighted_diffs = weights * diffs; + + # compute gradients + wgrad = weighted_diffs %*% C; + cgrad = t(weighted_diffs) %*% W; + bwgrad = rowSums(weighted_diffs); + bcgrad = matrix(colSums(weighted_diffs), nrow(bc), ncol(bc)); + + error = sum(0.5 * (weights * (diffs ^ 2))); + + # get steps and update + momentum_W = momentum_W + (wgrad ^ 2); + momentum_C = momentum_C + (cgrad ^ 2); + momentum_bw = momentum_bw + (bwgrad ^ 2); + momentum_bc = momentum_bc + (bcgrad ^ 2); + + W = W - (eta * wgrad / sqrt(momentum_W)); + C = C - (eta * cgrad / sqrt(momentum_C)); + bw = bw - (eta * bwgrad / sqrt(momentum_bw)); + bc = bc - (eta * bcgrad / sqrt(momentum_bc)); + + G = W + C; + + if (iter - floor(iter / print_loss_it) * print_loss_it == 0) { + print("iteration: " + iter + " error: " + error); + } + } + + # add the word index to the word vectors + G = cbind(cooc_index[,2], as.frame(G)); +} + +glove = function(Frame[Unknown] input, int seed, int vector_size, double alpha, double eta, double x_max, int iterations,int print_loss_it) + return (frame[Unknown] G){ + [cooc_matrix, cooc_index] = cooc::getCoocMatrix(input, 26000, 15, 
TRUE,TRUE); + G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, iterations, print_loss_it); + +} + +#input = read(§1, data_type="frame", format="csv", sep=","); +#G = glove(input, seed, vector_size, alpha, eta, x_max, iterations, print_loss_it); From 919198c1a5ad0fe8b88d5a65175d841649618ae1 Mon Sep 17 00:00:00 2001 From: Xixuan Zhang <119933243+xixuanzhang2022@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:49:49 +0100 Subject: [PATCH 2/9] [SYSTEMDS-3179] Update glove.dml Improve the gloveWithCoocMatrix function by incorporating an epsilon value for initialization and implementing a tolerance threshold to mitigate overfitting. --- scripts/builtin/glove.dml | 86 ++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 33 deletions(-) diff --git a/scripts/builtin/glove.dml b/scripts/builtin/glove.dml index fb0fbf3f200..38eb2b3f9cf 100644 --- a/scripts/builtin/glove.dml +++ b/scripts/builtin/glove.dml @@ -28,48 +28,51 @@ init = function(matrix[double] cooc_matrix, double x_max, double alpha) return(matrix[double] weights, matrix[double] log_cooc_matrix){ E = 2.718281828; bounded = pmin(cooc_matrix, x_max); - weights = (bounded / x_max) ^ alpha; + weights = pmin(1, (bounded / x_max) ^ alpha); log_cooc_matrix = ifelse(cooc_matrix > 0, log(cooc_matrix, E), 0); } -gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_index, int seed, int vector_size, double alpha, double eta, double x_max, int iterations,int print_loss_it) +gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_index, int seed, int vector_size, double alpha, double eta, double x_max, double tol, int iterations,int print_loss_it) return (frame[Unknown] G){ /* - * Computes vector embeddings for words based on their co-occurrence statistics in a large text corpus. + * Computes the vector embeddings for words by analyzing their co-occurrence statistics in a large text corpus. 
* - * Parameters: + * Inputs: * - cooc_matrix: Precomputed co-occurrence matrix of shape (N, N). * - cooc_index: Index file mapping words to their positions in the co-occurrence matrix. * The second column should contain the word list in the same order as the matrix. * - seed: Random seed for reproducibility. - * - vector_size: Dimensionality of word vectors (V). - * - eta: Learning rate for optimization. - * - alpha: Weighting function parameter. - * - x_max: Maximum co-occurrence value as per the GloVe paper. + * - vector_size: Dimensionality of word vectors, V. + * - eta: Learning rate for optimization, recommended value: 0.05. + * - alpha: Weighting function parameter, recommended value: 0.75. + * - x_max: Maximum co-occurrence value as per the GloVe paper: 100. + * - tol: Convergence tolerance; training stops early once the loss changes by less than tol between iterations, recommended value: 1e-4. * - iterations: Total number of training iterations. * - print_loss_it: Interval (in iterations) for printing the loss. * - * Returns: - * - G: A data structure containing word indices and their corresponding word vectors of shape (N, V), - * where each row represents a word vector of shape V. + * Outputs: + * - G: frame of the word indices and their word vectors, of shape (N, V). 
Each represented as a vector, of shape (1,V) */ - # initialize vocab_size = nrow(cooc_matrix); W = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed)-0.5)/vector_size; - C = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed)-0.5)/vector_size; - bw = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+1)-0.5)/vector_size; - bc = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+1)-0.5)/vector_size; + C = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed+1)-0.5)/vector_size; + bw = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+2)-0.5)/vector_size; + bc = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+3)-0.5)/vector_size; [weights, log_cooc_matrix] = init(cooc_matrix, x_max, alpha); - momentum_W = 0.1 * matrix(1, nrow(W), ncol(W)); - momentum_C = 0.1 * matrix(1, nrow(C), ncol(C)); - momentum_bw = 0.1 * matrix(1, nrow(bw), ncol(bw)); - momentum_bc = 0.1 * matrix(1, nrow(bc), ncol(bc)); + momentum_W = 1e-8 + 0.1 * matrix(1, nrow(W), ncol(W)); + momentum_C = 1e-8 + 0.1 * matrix(1, nrow(C), ncol(C)); + momentum_bw = 1e-8 + 0.1 * matrix(1, nrow(bw), ncol(bw)); + momentum_bc = 1e-8 + 0.1 * matrix(1, nrow(bc), ncol(bc)); error = 0; + iter = 0; + tolerance = tol; + previous_error = 1e10; + conti = TRUE; - for (iter in 1:iterations) { + while (conti) { # compute predictions for all co-occurring word pairs at once predictions = W %*% t(C) + bw + t(bc); @@ -83,19 +86,34 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i bcgrad = matrix(colSums(weighted_diffs), nrow(bc), ncol(bc)); error = sum(0.5 * (weights * (diffs ^ 2))); + iter = iter + 1; - # get steps and update - momentum_W = momentum_W + (wgrad ^ 2); - momentum_C = momentum_C + (cgrad ^ 2); - momentum_bw = momentum_bw + (bwgrad ^ 2); - momentum_bc = momentum_bc + (bcgrad ^ 2); - W = W - (eta * wgrad / sqrt(momentum_W)); - C = C - (eta * cgrad / sqrt(momentum_C)); - bw = bw - (eta * bwgrad / sqrt(momentum_bw)); - bc = bc 
- (eta * bcgrad / sqrt(momentum_bc)); + if (abs(previous_error - error) >= tolerance) { + if(iter <= iterations){ - G = W + C; + # get steps and update + momentum_W = momentum_W + (wgrad ^ 2); + momentum_C = momentum_C + (cgrad ^ 2); + momentum_bw = momentum_bw + (bwgrad ^ 2); + momentum_bc = momentum_bc + (bcgrad ^ 2); + + W = W - (eta * wgrad / (sqrt(momentum_W) + 1e-8)); + C = C - (eta * cgrad / (sqrt(momentum_C) + 1e-8)); + bw = bw - (eta * bwgrad / (sqrt(momentum_bw) + 1e-8)); + bc = bc - (eta * bcgrad / (sqrt(momentum_bc) + 1e-8)); + + G = W + C; + + previous_error = error; + + final_iter = iter; + } else { + conti = FALSE; + } + } else { + conti = FALSE; + } if (iter - floor(iter / print_loss_it) * print_loss_it == 0) { print("iteration: " + iter + " error: " + error); @@ -103,15 +121,17 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i } # add the word index to the word vectors + print("Given " + iterations + " iterations, " + "stopped (or converged) at the " + final_iter + " iteration / error: " + error); G = cbind(cooc_index[,2], as.frame(G)); } -glove = function(Frame[Unknown] input, int seed, int vector_size, double alpha, double eta, double x_max, int iterations,int print_loss_it) + +glove = function(Frame[Unknown] input, int seed, int vector_size, double alpha, double eta, double x_max, double tol, int iterations,int print_loss_it) return (frame[Unknown] G){ [cooc_matrix, cooc_index] = cooc::getCoocMatrix(input, 26000, 15, TRUE,TRUE); - G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, iterations, print_loss_it); + G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); } #input = read(§1, data_type="frame", format="csv", sep=","); -#G = glove(input, seed, vector_size, alpha, eta, x_max, iterations, print_loss_it); +#G = glove(input, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); From 
efeadc886f76355dbe3228d88e6f43353ab2e8e7 Mon Sep 17 00:00:00 2001 From: Xixuan Zhang <119933243+xixuanzhang2022@users.noreply.github.com> Date: Mon, 3 Feb 2025 14:18:59 +0100 Subject: [PATCH 3/9] [SYSTEMDS-3179] add script to test GloVe Implement cosine similarity and accuracy computation for word embeddings - Added `cosine_similarity` function to compute pairwise cosine similarity between word embeddings. - Implemented `get_top` function to retrieve the top-k most similar word embeddings for each word. - Created `accuracy` function to evaluate the overlap of top-k nearest neighbors between two sets of word embeddings. - Utilized matrix operations for efficient computation of similarity scores. This implementation aids in evaluating the GloVe models by measuring similarity and accuracy. --- scripts/builtin/gloveValidityTest.dml | 124 ++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 scripts/builtin/gloveValidityTest.dml diff --git a/scripts/builtin/gloveValidityTest.dml b/scripts/builtin/gloveValidityTest.dml new file mode 100644 index 00000000000..3b254a1bdfc --- /dev/null +++ b/scripts/builtin/gloveValidityTest.dml @@ -0,0 +1,124 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +cosine_similarity = function(matrix[double] trained_emb) + return (matrix[double] cosine_sim){ + /* + * Computes cosine similarity between word embeddings. + * + * Inputs: + * - trained_emb: Matrix of word embeddings of shape (N, D), where N is the number of words and D is the embedding dimension. + * + * Outputs: + * - cosine_sim: Matrix of cosine similarity scores between word embeddings, shape (N, N). + */ + dot_product = trained_emb %*% t(trained_emb); + row_norms = rowSums(trained_emb^2) ^ 0.5; + denominator = row_norms %*% t(row_norms); + cosine_sim = dot_product / denominator; +} + +get_top = function(matrix[double] trained_emb, int k) + return (matrix[double] topN_indices, matrix[double] topN_values){ + /* + * Retrieves the top k most similar word vectors for each word. + * + * Inputs: + * - trained_emb: Matrix of word embeddings of shape (N, D). + * - k: Number of top similar words to retrieve. + * + * Outputs: + * - topN_indices: Matrix containing indices of the top k most similar words for each word, shape (N, k). + * - topN_values: Matrix containing similarity values of the top k most similar words for each word, shape (N, k). 
+ */ + S = cosine_similarity(trained_emb); + n = nrow(S); + I = diag(matrix(1, rows=nrow(S), cols=1)); + S = S * (1 - I) + (-1.0 * I); + + # Get the sorted indices (descending order) of each row + topN_indices = matrix(0, rows=n, cols=k); + topN_values = matrix(0, rows=n, cols=k); + + for (i in 1:n){ + Scol = S[,i]; + topN = order(target=Scol, by=1, decreasing=TRUE, index.return=TRUE); + topNval = order(target=Scol, by=1, decreasing=TRUE, index.return=FALSE); + + topN = topN[1:k,]; + topv = topNval[1:k,]; + + topN_indices[i,] = t(topN); + topN_values[i,] = t(topv); + } +} + +accuracy = function(frame[unknown] X, frame[unknown] Y, int k) + return (double acc){ + /* + * Computes accuracy based on the overlap of the top-k nearest neighbors between two sets of word embeddings. + * + * Inputs: + * - X: First data frame containing a word column and embedding columns (shape: N x (D+1)). + * - Y: Second data frame containing a word column and embedding columns (shape: N x (D+1)). + * + * Outputs: + * - acc: Scalar value representing the accuracy as the fraction of words where the top-k nearest neighbors match between X and Y. 
+ */ + + # Extract only the embedding values, excluding the word column + X = as.matrix(X[,2:ncol(X)]); + Y = as.matrix(Y[,2:ncol(Y)]); + + [A, Aval] = get_top(X, k); + [B, Bval] = get_top(Y, k); + + count = 0; + for (i in 1:nrow(A)){ + # Select a specific row from both matrices (change row indices as needed) + row1 = A[i,]; # First row of A (1 x m) + row2 = B[i,]; # First row of B (1 x n) + + # Expand both rows to create a pairwise comparison matrix + row1_expanded = t(row1) %*% matrix(1, rows=1, cols=ncol(row2)); # (m x n) + row2_expanded = matrix(1, rows=ncol(row1), cols=1) %*% row2; # (m x n) + + # Element-wise comparison to find matches + matches = (row1_expanded == row2_expanded); # (m x n) Boolean matrix + + # Reduce to find unique elements that exist in both rows + overlap_mask = rowSums(matches) > 0; # (1 x m) mask of matched elements + common_elements = row1 * t(overlap_mask); # Retain only matching elements + + # Remove zeros (non-matching elements set to 0) + filtered_common_elements = removeEmpty(target=common_elements, margin="cols"); + + if (as.scalar(filtered_common_elements[1,1])!= 0){ + count = count+1; + } + } + acc = count/nrow(A); #output +} + +#X = read($1, format="csv", sep = " ", header=FALSE, data_type="frame"); +#Y = read($2, format="csv", sep = " ", header=FALSE, data_type="frame"); +#k = 10; +#accuracy(X,Y,k); From 701d3042304ff201484a8348db7550fbadd26a47 Mon Sep 17 00:00:00 2001 From: Samin <69201742+saminbassiri@users.noreply.github.com> Date: Tue, 4 Feb 2025 07:16:20 +0100 Subject: [PATCH 4/9] Update glove.dml Change glove function to be compatible with occurrence Matrix script --- scripts/builtin/glove.dml | 48 +++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/scripts/builtin/glove.dml b/scripts/builtin/glove.dml index 38eb2b3f9cf..cca44c11a73 100644 --- a/scripts/builtin/glove.dml +++ b/scripts/builtin/glove.dml @@ -22,7 +22,7 @@ # (2) 
https://github.com/stanfordnlp/GloVe/blob/master/src/glove.c #------------------------------------------------------------- -source("scripts/builtin/cooccur.dml") as cooc +source("scripts/builtin/cooccurrenceMatrix.dml") as cooc init = function(matrix[double] cooc_matrix, double x_max, double alpha) return(matrix[double] weights, matrix[double] log_cooc_matrix){ @@ -125,13 +125,47 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i G = cbind(cooc_index[,2], as.frame(G)); } - -glove = function(Frame[Unknown] input, int seed, int vector_size, double alpha, double eta, double x_max, double tol, int iterations,int print_loss_it) +glove = function( + Frame[Unknown] input, + int seed, int vector_size, + double alpha, double eta, + double x_max, + double tol, + int iterations, + int print_loss_it, + Int maxTokens, + Int windowSize, + Boolean distanceWeighting, + Boolean symmetric) return (frame[Unknown] G){ - [cooc_matrix, cooc_index] = cooc::getCoocMatrix(input, 26000, 15, TRUE,TRUE); - G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); + /* + * Main function that computes the vector embeddings for words in a large text corpus. + * INPUT: + * ------------------------------------------------------------------------------ + * - input (Frame[Unknown]): 1D input corpus in CSV format. + * - seed: Random seed for reproducibility. + * - vector_size: Dimensionality of word vectors, V. + * - eta: Learning rate for optimization, recommended value: 0.05. + * - alpha: Weighting function parameter, recommended value: 0.75. + * - x_max: Maximum co-occurrence value as per the GloVe paper: 100. + * - tol: Convergence tolerance; training stops early once the loss changes by less than tol between iterations, recommended value: 1e-4. + * - iterations: Total number of training iterations. + * - print_loss_it: Interval (in iterations) for printing the loss. + * - maxTokens (Int): Maximum number of tokens per text entry. 
+ * - windowSize (Int): Context window size. + * - distanceWeighting (Boolean): Whether to apply distance-based weighting. + * - symmetric (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE). + * ------------------------------------------------------------------------------ + * OUTPUT: + * ------------------------------------------------------------------------------ + * G (Frame[Unknown]): The word indices and their word vectors, of shape (N, V). Each represented as a vector, of shape (1,V) + * ------------------------------------------------------------------------------ + */ + + [cooc_matrix, cooc_index] = cooc::cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric); + G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); } -#input = read(§1, data_type="frame", format="csv", sep=","); -#G = glove(input, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); +#input = read("src/test/resources/datasets/20news/20news_subset_untokenized.csv", data_type="frame", format="csv", sep=","); +#G = glove(input[,4], seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it, 2600, 15, TRUE, TRUE); From 21c08627eada707b538001f42502492d9a361986 Mon Sep 17 00:00:00 2001 From: Samin <69201742+saminbassiri@users.noreply.github.com> Date: Tue, 4 Feb 2025 07:21:04 +0100 Subject: [PATCH 5/9] Update Builtins.java Add GloVe script in Builtins.java --- src/main/java/org/apache/sysds/common/Builtins.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index ab7400df447..0544d1e4446 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -151,6 +151,7 @@ public enum Builtins { GET_ACCURACY("getAccuracy", true), GLM("glm", true), GLM_PREDICT("glmPredict", true), + 
GLOVE("glove", true), GMM("gmm", true), GMM_PREDICT("gmmPredict", true), GNMF("gnmf", true), From 87fc8a14062926275254e34b30784a1628a3dea7 Mon Sep 17 00:00:00 2001 From: Samin <69201742+saminbassiri@users.noreply.github.com> Date: Tue, 4 Feb 2025 07:25:45 +0100 Subject: [PATCH 6/9] Add test script for glove word embedding. - This code first computes the cosine similarity of each pair of words in the glove result. - In the get_top function, the top k most similar words for each word are computed. - The result of this script is used for testing. --- src/test/scripts/functions/builtin/glove.dml | 85 ++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 src/test/scripts/functions/builtin/glove.dml diff --git a/src/test/scripts/functions/builtin/glove.dml b/src/test/scripts/functions/builtin/glove.dml new file mode 100644 index 00000000000..53408f25ee4 --- /dev/null +++ b/src/test/scripts/functions/builtin/glove.dml @@ -0,0 +1,85 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +cosine_similarity = function(matrix[double] trained_emb) + return (matrix[double] cosine_sim){ + /* + * Computes cosine similarity between word embeddings. 
+ * + * Inputs: + * - trained_emb: Matrix of word embeddings of shape (N, D), where N is the number of words and D is the embedding dimension. + * + * Outputs: + * - cosine_sim: Matrix of cosine similarity scores between word embeddings, shape (N, N). + */ + dot_product = trained_emb %*% t(trained_emb); + row_norms = rowSums(trained_emb^2) ^ 0.5; + denominator = row_norms %*% t(row_norms); + cosine_sim = dot_product / denominator; +} + +get_top = function(matrix[double] trained_emb, int k, Frame[Unknown] column) + return (Frame[Unknown] result){ + /* + * Retrieves the top k most similar word vectors for each word. + * + * Inputs: + * - trained_emb: Matrix of word embeddings of shape (N, D). + * - k: Number of top similar words to retrieve. + * - column: Frame containing the word column. + * + * Outputs: + * - result: Frame containing the top k most similar words for each word, shape (N, k+1). + * The first column contains the target word, followed by k columns with similar words. + */ + S = cosine_similarity(trained_emb); + n = nrow(S); + I = diag(matrix(1, rows=nrow(S), cols=1)); + S = S * (1 - I) + (-1.0 * I); + result = column; + for(i in 1:k){ + result = cbind(result, column); + } + for (i in 1:n){ + Scol = S[,i]; + topN = order(target=Scol, by=1, decreasing=TRUE, index.return=TRUE); + + for(j in 2:k+1){ + result[i, j] = column[as.integer(as.scalar(topN[j]))]; + } + } +} + +# Read input word embeddings +X = read($input, data_type="frame", format="csv", sep=",", header=FALSE); + +# compute glove result for input text +G = glove(X[,4], as.integer($seed), as.integer($vector_size), as.double($alpha), as.double($eta), as.integer($x_max), as.double($tol), as.integer($iterations), as.integer($print_loss_it), as.integer($maxTokens), as.integer($windowSize), $distanceWeighting, $symmetric); + +# Extract only the embeddings (excluding the word column) +X_column = G[,1]; +X_matrix = as.matrix(G[,2:ncol(G)]); + +# Compute top-K similar words +result = get_top(X_matrix, 
$topK, X_column); + +# Write results to output file +write(result, $out_result, data_type="frame", format="csv"); + From 179659cb714144fe3b7eb158aa411247e9701899 Mon Sep 17 00:00:00 2001 From: Samin <69201742+saminbassiri@users.noreply.github.com> Date: Tue, 4 Feb 2025 08:03:37 +0100 Subject: [PATCH 7/9] Create gloveExpectedTop10.csv - This data is used to test the DML script results for GloVe word embedding. - This file contains the top 10 most similar words for each word in the GloVe word embedding, based on (https://github.com/roamanalytics/mittens/tree/master). - The test dataset is provided under test/resources in '20news/20news_subset_untokenized.csv'. --- .../datasets/GloVe/gloveExpectedTop10.csv | 478 ++++++++++++++++++ 1 file changed, 478 insertions(+) create mode 100644 src/test/resources/datasets/GloVe/gloveExpectedTop10.csv diff --git a/src/test/resources/datasets/GloVe/gloveExpectedTop10.csv b/src/test/resources/datasets/GloVe/gloveExpectedTop10.csv new file mode 100644 index 00000000000..c736311e719 --- /dev/null +++ b/src/test/resources/datasets/GloVe/gloveExpectedTop10.csv @@ -0,0 +1,478 @@ +from,apart,decay,cfaehl,mathew,subject,discourage,prejudices,viceicotekcom,referring,words +decay,cbnewsjcbattcom,tammy,r,deankaflowitz,healy,from,cfaehl,chris,timmbake,faehl +cbnewsjcbattcom,decay,deankaflowitz,tammy,r,healy,might,wrong,faehl,intend,get +deankaflowitz,cbnewsjcbattcom,faehl,wrong,decay,subject,healy,might,intend,yet,r +subject,mantiscouk,re,deankaflowitz,faehl,clam,wrong,mathew,amusing,bake,correct +re,subject,rushdie,yet,wrong,correct,amusing,more,deankaflowitz,tell,faehl +about,knowledgeable,or,anything,worse,had,mathew,reference,faehl,pronoun,mantiscouk +the,post,of,wholesome,useful,more,ark,that,university,extermination,new +bible,quiz,example,using,reliable,baptist,wrt,answers,stated,rules,statements +quiz,answers,bible,university,rules,boasted,whethe,bank,instead,banking,mantis 
+answers,quiz,mantis,organization,university,consultants,at,boasted,bible,agnostics,law +organization,agnostics,law,mantis,university,answers,hard,consultants,at,pretty,proof +at,least,answers,blanket,mantis,distribution,consultants,believe,pretty,organization,world +t,wrodhipped,wasn,they,mm,and,haven,won,provided,another,content +distribution,na,lines,world,albuquerque,v,nntp,maybe,at,mm,lo +na,distribution,lines,world,albuquerque,nntp,v,says,jaeger,alyosha,dogma +lines,distribution,na,albuquerque,v,correction,jaeger,proselytize,world,conspiracy,this +in,article,engaged,evidence,useful,my,idol,of,for,lines,that +article,in,illustrate,engaged,allegory,guy,some,provide,healta,mere,if +healta,saturnwwcedu,tammy,if,article,fool,r,albuquerque,wants,nntp,too +saturnwwcedu,healta,tammy,r,if,guess,too,want,wants,created,idols +tammy,r,saturnwwcedu,decay,cbnewsjcbattcom,healy,healta,intend,deankaflowitz,bad,misinformed +r,healy,tammy,cbnewsjcbattcom,decay,deankaflowitz,saturnwwcedu,bad,mantiscouk,intend,writes +healy,r,tammy,cbnewsjcbattcom,decay,deankaflowitz,writes,cheribums,intend,bad,mantiscouk +writes,fallacy,guy,healy,timmons,viceicotekcom,mantiscouk,r,many,beauchaine,bake +cheribums,are,on,healy,arrogant,wont,hoping,deankaflowitz,more,but,as +are,arrogant,cheribums,wont,hoping,say,theists,who,on,arrogance,interpretation +on,year,cheribums,ark,reliable,statements,day,university,are,arrogant,rules +ark,covenant,of,theists,on,university,banking,new,the,atheism,rules +of,ark,the,university,new,time,atheism,group,down,millions,theists +covenant,when,wasn,ark,bible,waste,god,they,hard,t,example +when,god,covenant,wasn,wrodhipped,won,said,bible,make,re,offer +god,when,justify,does,make,followed,said,poster,exist,evidence,covenant +said,never,make,god,original,graven,when,had,millions,killed,na +make,said,no,god,graven,idol,questions,does,answers,when,followed +no,reason,have,make,occasion,wholesome,and,useful,agree,they,idol 
+graven,image,had,boasted,instead,aren,down,make,quiz,waste,whether +image,graven,had,whether,waste,determine,down,put,millions,he,instead +he,relied,by,whethe,implies,virtue,anything,upon,try,agnostics,innocent +was,refering,university,implies,koran,something,an,that,rules,adapted,meaning +refering,idols,was,something,implies,may,clear,prepared,position,meaning,mongols +to,articles,references,i,reason,want,created,you,not,determine,agree +idols,which,refering,support,out,created,pointing,kept,shines,may,itself +which,idols,created,support,were,out,intend,pointing,bobby,justifications,healy +were,bad,created,justifications,fool,which,reference,illustrate,compelling,very,healy +created,which,were,bad,support,out,justifications,fool,idols,grip,pointing +be,d,glad,must,fool,then,it,a,albuquerque,created,worse +worshipped,him,someone,loved,year,ark,join,buphybuedu,on,faith,wants +wasn,wrodhipped,covenant,when,t,day,won,atonement,they,bible,god +wrodhipped,wasn,t,atonement,won,day,and,only,they,when,titles +and,wholesome,him,useful,wrodhipped,mmm,questions,day,agnostics,amusing,titles +only,titles,worthy,wrodhipped,respond,m,does,seriously,how,bzzt,fit +high,priest,we,name,enter,could,defending,bzzt,virtue,fit,occasion +priest,high,enter,could,name,we,virtue,defending,fit,occasion,extermination +could,enter,too,priest,mongols,high,bad,therefore,d,we,am +enter,holy,could,priest,high,therefore,bad,stating,respond,cannot,times +holy,holies,enter,large,group,bcci,sources,kaflowitz,put,stating,down +holies,holy,where,large,dean,brothers,kaflowitz,out,group,bcci,case +where,post,holies,then,understand,must,it,d,call,case,bcci +it,is,be,justify,where,that,albuquerque,a,irrelevant,gregg,must +kept,once,shines,misinformation,intellectually,word,year,simply,beckoning,jaeger,serious +once,kept,beckoning,wonderful,year,man,again,vestaunmedu,arrogance,faq,nntp +a,simply,is,be,issued,faith,it,waste,albuquerque,title,lines 
+year,on,day,arrogant,kept,worshipped,issued,once,buphybuedu,cheribums,ark +day,atonement,idol,wrodhipped,whether,year,on,wasn,determine,and,his +atonement,day,am,wrodhipped,and,wasn,idol,fit,time,look,someone +i,not,but,am,lo,albuquerque,m,mexico,to,for,new +am,referring,atonement,prepared,familiar,not,i,m,lo,we,words +not,i,am,familiar,prepared,an,even,time,worthy,looking,words +familiar,misinformed,worthy,respond,articles,am,knowledgeable,or,not,with,words +with,articles,agree,moslems,atheists,makes,respond,amusing,familiar,you,do +or,knowledgeable,familiar,reference,seriously,about,knows,respond,adapted,neil,bake +knowledgeable,or,about,familiar,reference,defending,yet,deankaflowitz,clam,bake,respond +original,language,dogma,reference,had,my,said,illustrate,point,statement,include +language,original,but,extermination,apologize,qualifying,my,had,illustrate,want,guess +but,language,i,guess,etc,merely,sorry,cheribums,using,can,mexico +believe,reason,at,anyone,is,meant,there,least,iii,maybe,wont +there,iii,know,s,believe,doubtless,at,titles,extermination,nothing,year +is,it,a,that,reason,albuquerque,evidence,irrelevant,believe,how,useful +word,respect,instead,idol,titles,kept,used,for,using,report,buphybuedu +for,titles,evidence,fit,respect,real,mm,upon,word,i,occasion +idol,day,titles,word,whether,determine,fit,provide,nothing,make,instead +that,is,it,idea,useful,wont,was,the,evidence,s,say +translator,would,used,justify,bcci,mmm,might,grip,his,large,occasion +would,translator,grip,call,used,mmm,irrelevant,man,atheist,be,rule +have,no,cannot,been,lost,referring,may,grip,reason,used,man +used,translator,indicting,all,would,man,instead,have,word,while,grip +instead,boasted,graven,word,all,stalin,indicting,determine,quiz,report,mere +had,image,graven,whether,original,determine,proof,language,millions,about,said +so,although,proof,wont,august,compelling,let,must,by,pointing,kill +think,makes,m,may,correct,defending,my,moslems,wrong,amusing,what 
+you,agree,articles,with,tell,correct,that,to,a,yet,the +wrong,here,deankaflowitz,might,re,faehl,correct,kaflowitz,dean,subject,cbnewsjcbattcom +here,wrong,might,dogmatic,anyway,understand,posting,deankaflowitz,see,wonderful,cbnewsjcbattcom +then,case,must,prove,where,seriously,be,d,it,again,get +again,wonderful,while,times,post,want,every,reader,then,lost,stating +too,could,glad,suggesting,just,blanket,issued,saturnwwcedu,d,apart,meant +just,mm,suggesting,ve,got,too,issued,special,looking,for,provided +suggesting,just,mm,issued,way,too,correction,looking,discussion,conspiracy,real +way,suggesting,mm,determine,because,this,say,just,such,occasion,another +determine,whether,flocking,people,way,day,instead,idol,had,image,sophisticated +whether,determine,interpretation,flocking,day,idol,innocent,had,people,image,instead +interpretation,offer,whether,his,they,won,atheists,determine,idol,group,people +offer,interpretation,correct,didn,won,moslems,haven,atheists,because,absolutely,wrong +correct,dean,didn,offer,wrong,kaflowitz,re,oranges,subject,think,clam +dean,kaflowitz,correct,wrong,today,loved,others,holies,kill,christians,absolutely +kaflowitz,dean,others,wrong,correct,loved,mcl,unalterably,under,absolutely,holies +cfaehl,host,vestaunmedu,chris,apart,decay,from,mathew,thought,timmbake,achieve +vestaunmedu,host,cfaehl,chris,beckoning,did,issue,defending,discuss,given,fit +chris,faehl,cfaehl,vestaunmedu,decay,deankaflowitz,host,cbnewsjcbattcom,might,mathew,mantiscouk +faehl,chris,deankaflowitz,yet,subject,mantiscouk,wrong,bake,didn,clam,cbnewsjcbattcom +amusing,yet,atheists,mclucsbedu,with,re,subject,clam,mmm,moslems,and +atheists,moslems,do,amusing,bzzt,articles,with,today,christians,oranges,agree +agnostics,university,organization,mantis,koran,consultants,answers,and,law,reason,whethe +university,agnostics,new,organization,answers,mantis,quiz,ark,consultants,koran,was +new,mexico,albuquerque,university,ark,of,cambridge,consultants,hear,uk,x 
+mexico,new,albuquerque,cambridge,uk,x,lo,rusnews,hear,newsreader,i +albuquerque,mexico,new,lines,v,rusnews,distribution,na,lo,newsreader,be +world,nntp,distribution,na,v,maybe,rusnews,must,posting,anyway,says +nntp,world,posting,host,v,rusnews,says,distribution,maybe,na,jaeger +posting,nntp,host,see,world,here,anyway,wonderful,cfaehl,excuse,propaganda +host,vestaunmedu,cfaehl,posting,nntp,beckoning,thought,achieve,chris,world,wonderful +timmbake,mcl,mclucsbedu,decay,cfaehl,cbnewsjcbattcom,clam,chris,prove,deankaflowitz,claims +mcl,timmbake,mclucsbedu,claims,kaflowitz,clam,justifications,chris,cfaehl,fashion,bobbe +mclucsbedu,clam,mcl,timmbake,amusing,deankaflowitz,faehl,bobbe,didn,yet,wrong +clam,mclucsbedu,bake,didn,timmons,subject,faehl,deankaflowitz,amusing,knowledgeable,mcl +bake,timmons,clam,iii,faehl,subject,beauchaine,mclucsbedu,wrong,knowledgeable,explained +timmons,bake,iii,clam,fallacy,writes,bennett,beauchaine,dostoevsky,neil,christians +fallacy,many,writes,timmons,christians,hard,subject,atheism,bake,respond,amusing +atheism,hard,whethe,merely,of,university,recognizing,ark,has,fallacy,new +faith,lo,special,yes,a,mm,jaeger,someone,worshipped,place,maybe +lo,hear,albuquerque,greater,btw,faith,apology,v,i,mexico,am +hear,lo,faq,newsreader,v,beckoning,rusnews,mexico,albuquerque,supports,new +faq,etc,beckoning,hear,name,guess,banking,let,times,once,burden +beckoning,once,faq,vestaunmedu,host,supports,hear,times,kept,nntp,posting +wonderful,job,rule,deleted,again,don,slander,although,posting,understand,here +rule,don,deleted,wonderful,special,mmm,medium,haven,slippery,call,pronoun +deleted,wrt,medium,rule,wonderful,slander,because,they,arrogant,slippery,agree +didn,correct,clam,faehl,offer,mclucsbedu,anything,subject,group,mantiscouk,other +say,such,are,other,fool,they,than,way,didn,because,arrogant +anything,other,piece,didn,about,fool,than,he,memory,say,usenet +conspiracy,correction,greater,mongols,discussion,looking,suggesting,waste,whethe,sorry,lo 
+correction,conspiracy,looking,waste,lines,suggesting,discussion,mongols,greater,simply,sorry +hard,atheism,indictment,fallacy,organization,correction,of,covenant,has,what,not +yes,mmm,enough,faith,we,any,stronger,robert,what,why,agnostics +don,rule,mix,wonderful,aren,his,apples,why,propaganda,deleted,slanderous +mix,apples,don,aren,superior,also,supporting,oranges,unsympathetic,atheists,did +apples,mix,oranges,superior,don,atheists,aren,unsympathetic,wonderful,moslems,also +oranges,apples,moslems,do,state,understand,correct,atheists,respond,misinformed,how +how,seriously,many,oranges,respond,today,can,do,religion,only,is +can,tell,how,oranges,religion,fool,i,want,references,but,good +extermination,mongols,by,times,language,august,conspiracy,banking,burden,the,doubtless +by,we,enough,extermination,name,he,pointing,qualifying,bzzt,virtue,apologize +mongols,extermination,conspiracy,clear,correction,could,worse,whethe,by,enough,hear +worse,than,stimulating,glad,mongols,other,fool,debate,greater,be,about +than,other,life,worse,greater,words,anything,some,say,mm,conspiracy +stalin,looking,instead,khan,killed,kept,brought,did,issued,prepared,simply +khan,conquered,prepared,greater,through,explained,stalin,blanket,stimulating,shines,unsympathetic +conquered,khan,unsympathetic,prepared,nntp,through,cfaehl,world,explained,greater,blanket +people,determine,whether,articles,koran,will,adapted,exist,of,discourage,own +unsympathetic,conquered,aren,causethat,prove,bobby,lost,khan,apples,may,alyosha +his,flocking,don,interpretation,all,causethat,day,baptist,translator,atrociousbut,provide +causethat,atrociousbut,neil,unsympathetic,bennett,an,alyosha,anti,his,aren,slander +atrociousbut,causethat,killed,millions,an,slander,his,example,stalin,mathew,arrogance +killed,millions,atrociousbut,stalin,while,all,looking,reader,brought,said,reliable +millions,killed,atrociousbut,reader,of,burden,graven,they,had,own,said +own,prejudices,reader,every,millions,people,fit,your,theists,from,idol 
+who,wish,baptist,boasted,loved,are,arrogant,thanks,exist,sorry,something +loved,kaflowitz,dean,bzzt,who,worshipped,baptist,boasted,burden,propaganda,slanderous +him,join,wholesome,stronger,and,yet,worshipped,discuss,more,rushdie,include +atheist,state,soccultureislam,place,got,good,irrelevant,ve,mmm,would,special +state,atheist,absolutely,unalterably,oranges,christians,today,many,religion,clam,mclucsbedu +anyone,irrelevant,been,believe,be,respect,convinced,wont,glad,thanks,iii +will,discourage,apologize,kill,cambridge,explain,one,uk,people,x,consultants +explain,may,referring,apologize,will,otherwise,prepared,lo,lost,this,says +this,contention,supports,explain,way,lines,lost,times,excuse,support,distribution +did,nothing,vestaunmedu,mathew,discuss,higher,has,occasion,report,man,august +nothing,has,supports,did,higher,fit,stronger,idol,times,titles,only +name,by,under,let,virtue,any,high,bzzt,priest,faq,enough +whethe,he,debate,doubtless,quiz,atheism,using,recognizing,relied,mongols,conspiracy +an,indictment,causethat,not,allegory,anti,example,islamic,as,got,was +irrelevant,get,justify,anyone,usenet,it,d,would,is,atheist,deankaflowitz +get,irrelevant,idea,might,deankaflowitz,cbnewsjcbattcom,justify,become,grip,then,bcci +grip,man,would,d,issued,created,have,glad,get,blanket,mantiscouk +man,grip,while,used,did,have,issued,blanket,would,once,issue +example,bible,as,arrogance,reliable,sources,statements,using,baptist,recognizing,koran +brought,up,statements,looking,stalin,blanket,reliable,achieve,example,issue,killed +up,brought,statements,look,suggesting,indictment,why,blanket,worthy,allegory,wants +as,example,arrogance,superior,guilty,itself,reliable,merely,sources,more,indictment +indictment,an,soccultureislam,proof,hard,as,atheism,time,burden,got,up +merely,recognizing,using,as,atheism,but,arrogance,example,bennett,been,indictment +another,superior,anti,down,such,because,t,beauchaine,time,they,alyosha 
+kill,others,file,my,illustrate,apologize,will,point,discourage,under,dean +others,under,kill,stimulating,file,kaflowitz,debate,pretty,dean,explained,qualifying +under,others,stimulating,name,debate,discourage,pretty,any,btw,dogmatic,clear +any,position,dogmatic,questions,under,name,knows,m,we,defending,yes +s,fit,what,there,second,name,let,many,titles,that,doubtless +fit,occasion,titles,s,for,respect,let,nothing,idol,excuse,high +occasion,fit,titles,no,did,respect,high,for,times,priest,stating +look,up,become,soccultureislam,special,occasion,convinced,content,atonement,brought,supports +while,slander,never,articles,again,job,man,killed,agree,superior,mmm +never,said,while,meant,special,place,supporting,mmm,makes,agree,looking +implication,pretty,very,kill,sources,intellectually,proof,justifications,burden,medium,don +pretty,clear,implication,others,intellectually,under,enough,answers,very,least,mantis +clear,pretty,enough,mongols,refering,sorry,under,others,intellectually,btw,organization +m,defending,think,sorry,worthy,am,only,i,any,religion,may +sorry,least,m,proof,providing,greater,btw,conspiracy,clear,correction,whethe +respond,therefore,misinformed,cannot,also,familiar,with,articles,seriously,religion,only +your,contention,spend,innocent,time,words,referring,rusnews,true,own,am +words,familiar,prepared,your,prove,than,am,referring,misinformed,true,not +true,memory,alyosha,usenet,unalterably,meaning,want,your,words,good,dogma +meaning,usenet,thought,justify,dogma,true,worse,simply,was,refering,others +usenet,meaning,justify,simply,memory,irrelevant,true,dogma,fool,absolutely,alyosha +slippery,medium,issued,simply,obviously,piece,rule,deleted,spreading,waste,justify +medium,slippery,wrt,deleted,rule,issued,baptist,implication,provided,pronoun,intellectually +wrt,burden,deleted,medium,baptist,bible,all,reliable,theists,pronoun,arrogant +burden,wrt,proof,august,arrogance,theists,baptist,reliable,boasted,include,all 
+proof,burden,indictment,let,sorry,so,arrogance,had,bank,organization,answers +has,nothing,higher,prove,pointing,misinformed,did,out,put,sophisticated,atheism +prove,then,unsympathetic,seriously,religion,misinformed,has,want,bobby,words,given +does,justify,god,exist,only,says,cannot,make,it,case,one +justify,does,irrelevant,god,usenet,translator,evidence,it,get,bcci,today +exist,know,wish,does,people,prejudices,glad,god,have,who,been +know,there,exist,apology,absolutely,btw,won,iii,conspiracy,explained,provided +etc,faq,guess,providing,but,hear,propaganda,see,priest,sorry,extermination +guess,etc,ass,but,justifications,if,faq,saturnwwcedu,fashion,given,burden +what,many,ass,s,makes,think,second,m,that,i,for +if,see,someone,healta,saturnwwcedu,do,guess,wants,lost,ass,propaganda +those,justifications,out,very,pointing,title,sources,itself,engaged,pretty,if +justifications,those,out,were,very,fool,intellectually,pointing,created,sources,illustrate +compelling,aren,although,why,others,thread,out,justifications,kill,so,were +why,thread,bank,aren,compelling,don,up,yes,special,excuse,atonement +aren,compelling,unsympathetic,why,thread,graven,don,flocking,bank,mix,arrogance +flocking,whether,determine,his,aren,down,put,all,has,spend,articles +they,won,pronoun,t,because,theists,useful,wrodhipped,deleted,say,interpretation +won,they,wrodhipped,haven,offer,recognizing,wasn,when,and,provided,know +one,greater,discourage,boasted,nntp,anyway,misinformation,will,lo,debate,uk +discourage,one,will,under,others,apologize,kill,bzzt,apology,from,prejudices +pointing,out,by,put,justifications,has,those,which,claims,down,created +out,pointing,those,justifications,which,created,support,idols,mere,compelling,has +very,sources,intellectually,justifications,implication,those,pretty,out,title,stimulating,were +sources,very,reliable,fashion,example,same,intellectually,as,justifications,bible,large +reliable,statements,sources,bible,fashion,example,indicting,on,as,wrt,include 
+statements,reliable,indicting,brought,up,example,bible,on,all,issue,rules +supporting,dogmatic,charged,islam,never,wonderful,wrong,here,mix,questions,think +dogmatic,position,any,here,might,supporting,we,understand,btw,anyway,may +position,dogmatic,any,claims,might,under,although,wrong,intend,refering,if +d,be,glad,fool,then,grip,must,world,irrelevant,where,created +fool,were,justifications,be,d,created,say,worse,anything,usenet,illustrate +large,group,holy,discuss,reliable,sources,holies,doubtless,didn,bcci,translator +group,large,holy,didn,of,holies,down,idol,interpretation,arrogance,provide +wish,proselytize,boasted,exist,who,prepared,world,glad,khan,conquered,says +proselytize,wish,lines,same,conspiracy,mere,correction,implies,report,suggesting,mongols +same,fashion,sources,proselytize,include,reliable,writes,bible,mcl,implication,allegory +fashion,same,sources,reliable,include,above,bible,reference,allegory,statements,mcl +religion,given,seriously,prove,defending,respond,many,worthy,can,state,how +many,christians,ass,fallacy,what,my,today,qualifying,how,do,writes +do,atheists,oranges,today,christians,with,wont,many,moslems,articles,sophisticated +see,if,propaganda,posting,let,slanderous,someone,here,excuse,some,ass +anyway,maybe,here,nntp,one,world,posting,dogmatic,misinformation,serious,intend +maybe,anyway,mm,world,misinformation,nntp,distribution,serious,glad,lo,shines +mm,just,suggesting,maybe,way,looking,t,faith,misinformation,for,life +looking,correction,waste,stalin,suggesting,conspiracy,special,mm,brought,even,second +enough,second,by,clear,pretty,yes,we,name,looking,blanket,mongols +second,enough,makes,looking,s,what,a,lines,stalin,simply,waste +makes,second,think,with,what,never,amusing,article,how,my,correct +defending,given,understand,m,worthy,religion,knowledgeable,might,high,think,wrong +given,defending,religion,understand,although,prove,oranges,many,dogmatic,others,seriously 
+recognizing,using,merely,baptist,whethe,example,won,atheism,bible,reliable,sorry +meant,never,believe,too,reason,glad,charged,worshipped,wants,be,wish +although,understand,compelling,must,given,lost,we,wonderful,so,qualifying,oranges +understand,although,given,might,defending,where,oranges,here,dogmatic,wonderful,kaflowitz +might,get,idea,here,wrong,understand,deankaflowitz,cbnewsjcbattcom,dogmatic,faehl,iii +idea,get,might,convinced,become,debate,doubtless,btw,stimulating,that,mexico +using,recognizing,merely,bible,example,all,whethe,baptist,arrogance,but,theists +allegory,illustrate,alyosha,article,an,memory,engaged,statement,firmly,justifications,fashion +illustrate,allegory,point,my,article,engaged,head,kill,dogma,right,alyosha +my,point,dogma,illustrate,many,kill,engaged,original,ass,head,think +point,illustrate,my,bobbe,qualifying,kill,provide,christians,do,atheists,file +we,by,qualifying,apologize,let,although,high,dogmatic,higher,referring,enough +referring,am,lost,every,excuse,may,prepared,we,have,explain,bobby +every,reader,referring,bobby,thread,all,lost,join,theists,own,again +reader,every,theists,post,all,times,own,spend,thread,millions,arrogance +post,where,the,rules,adapted,bcci,koran,reader,john,again,banking +evidence,for,upon,justify,is,in,relied,that,won,thanks,god +poster,stated,such,bible,doubtless,god,debate,convinced,absolutely,justify,chris +stated,poster,doubtless,debate,bible,absolutely,such,stimulating,convinced,chris,relied +relied,upon,he,debate,whethe,doubtless,evidence,under,stated,by,unalterably +upon,relied,evidence,for,he,thought,discussion,unalterably,whethe,debate,priest +may,lost,explain,referring,think,refering,have,prepared,dogmatic,btw,least +lost,may,thread,referring,bobby,although,unsympathetic,have,supports,every,explain +thread,lost,why,supports,aren,all,indicting,every,someone,compelling,reader +theists,baptist,all,ark,arrogant,reader,burden,boasted,are,koran,wrt 
+arrogant,are,theists,baptist,thanks,hoping,cheribums,year,arrogance,wont,deleted +because,they,such,way,deleted,down,another,references,say,offer,life +such,absolutely,say,poster,because,stated,unalterably,doubtless,another,and,pronoun +absolutely,unalterably,such,state,stated,know,doubtless,spreading,kaflowitz,usenet,under +unalterably,absolutely,state,true,such,kaflowitz,alyosha,memory,usenet,dean,others +dogma,says,my,today,original,alyosha,memory,otherwise,illustrate,place,thought +says,otherwise,dogma,nntp,world,jaeger,does,na,explain,wish,it +prepared,khan,am,issue,conquered,not,wish,referring,words,may,blanket +issue,prepared,discuss,blanket,stronger,statements,vestaunmedu,more,higher,khan,did +blanket,issued,at,too,indicting,issue,khan,prepared,conquered,man,enough +indicting,all,statements,used,reliable,mathew,thread,achieve,john,blanket,instead +all,indicting,theists,thread,achieve,instead,used,john,baptist,arrogance,wrt +arrogance,example,as,burden,all,arrogant,wont,proof,theists,wrt,reader +wont,are,do,file,arrogant,hoping,arrogance,cheribums,so,that,kill +bzzt,virtue,atheists,apologize,qualifying,moslems,loved,by,high,name,discourage +virtue,bzzt,name,by,high,he,innocent,priest,idol,fit,own +innocent,little,pronoun,whether,your,virtue,instead,he,whethe,graven,provide +little,innocent,pronoun,apart,discuss,high,whether,priest,they,translator,some +pronoun,little,innocent,they,wrt,rule,all,such,won,about,john +ve,got,soccultureislam,just,issued,call,good,rule,atheist,suggesting,not +issued,blanket,suggesting,just,slippery,simply,ve,grip,too,a,medium +statement,illustrate,allegory,least,at,thought,my,viceicotekcom,if,head,unsympathetic +least,at,mantis,sorry,apologize,may,pretty,consultants,anyway,statement,blanket +apologize,qualifying,will,bzzt,must,we,kill,language,explain,by,least +qualifying,apologize,we,many,bzzt,point,others,by,language,christians,although +place,special,atheist,got,never,dogma,rule,causethat,mmm,obviously,faith 
+call,john,baptist,would,indicting,got,slander,rule,all,job,post +john,call,baptist,all,post,indicting,issued,banking,theists,wrt,rules +baptist,john,theists,boasted,wrt,bible,arrogant,who,rules,call,all +boasted,instead,graven,baptist,wish,answers,quiz,one,who,greater,theists +greater,one,conspiracy,khan,lo,than,correction,boasted,debate,sorry,worse +christians,many,today,ass,do,atheists,state,fallacy,qualifying,timmons,piece +today,christians,dogma,many,dean,do,atheists,state,moslems,with,justify +itself,claims,guilty,as,superior,justifications,those,anti,out,sources,charged +guilty,charged,itself,as,claims,firmly,engaged,superior,example,arrogance,head +charged,guilty,supporting,as,looking,itself,again,meant,wrong,too,brought +other,than,anything,say,worse,some,life,didn,simply,conspiracy,fool +thought,meaning,host,cfaehl,btw,apology,dogma,my,nntp,upon,convinced +claims,itself,superior,guilty,mcl,position,pointing,out,justifications,as,beckoning +superior,claims,as,another,itself,slander,apples,do,while,anti,guilty +thanks,arrogant,baptist,apology,special,who,evidence,theists,serious,misinformation,for +apology,btw,excuse,lo,thought,jaeger,thanks,know,misinformation,otherwise,shines +btw,apology,excuse,lo,thought,idea,dogmatic,under,sorry,may,iii +worthy,familiar,only,seriously,misinformed,defending,m,not,religion,higher,statements +seriously,misinformed,worthy,how,religion,prove,then,therefore,respond,or,bad +misinformed,seriously,familiar,respond,worthy,sophisticated,prove,join,articles,has,oranges +sophisticated,put,down,articles,misinformed,provide,time,do,familiar,has,support +put,down,sophisticated,provide,pointing,time,has,out,flocking,graven,determine +down,put,sophisticated,provide,graven,time,pointing,flocking,image,of,useful +serious,misinformation,shines,rusnews,boasted,maybe,v,theists,nntp,anyway,kept +misinformation,serious,shines,maybe,rusnews,kept,v,through,glad,nntp,one 
+shines,through,misinformation,serious,explained,kept,providing,discussion,khan,apology,rusnews +through,explained,shines,providing,discussion,khan,misinformation,others,conquered,kept,btw +explained,through,shines,providing,discussion,khan,above,others,bake,stimulating,conquered +above,reference,explained,fashion,include,reliable,bake,unalterably,absolutely,or,religion +iii,there,timmons,bake,dostoevsky,might,btw,apology,know,wrong,mathew +higher,stronger,nothing,has,we,did,issue,supports,misinformed,worthy,join +stronger,higher,wholesome,more,him,issue,yet,nothing,rushdie,join,yes +more,yet,wholesome,rushdie,stronger,useful,re,seriously,him,as,the +wholesome,stronger,more,him,and,useful,rushdie,yet,the,no,join +useful,life,wholesome,more,and,the,rushdie,they,yet,down,that +life,useful,than,other,they,mm,looking,some,worse,because,down +some,guy,viceicotekcom,ass,bobbe,other,someone,wants,article,than,good +good,memory,islam,familiar,knows,atheist,viceicotekcom,some,ve,alyosha,true +memory,alyosha,true,good,dogma,usenet,allegory,try,illustrate,unalterably,anything +alyosha,memory,firmly,dogma,brothers,allegory,head,true,engaged,illustrate,causethat +brothers,karamazov,dostoevsky,alyosha,holies,obviously,otherwise,simply,kept,unalterably,providing +karamazov,dostoevsky,brothers,file,obviously,others,otherwise,iii,kill,atrociousbut,idols +dostoevsky,karamazov,brothers,iii,otherwise,file,timmons,others,right,obviously,arrogance +mathew,achieve,mantiscouk,indicting,cfaehl,subject,apart,faehl,did,all,chris +mantiscouk,mathew,subject,faehl,viceicotekcom,bobbe,guy,yet,r,chris,deankaflowitz +yet,rushdie,more,amusing,faehl,re,wholesome,deankaflowitz,him,knowledgeable,mclucsbedu +rushdie,yet,more,re,wholesome,useful,bcci,stronger,law,tell,amusing +islamic,anti,slander,been,law,job,an,rushdie,followed,bcci,implies +law,mantis,organization,islamic,answers,rushdie,agnostics,consultants,pretty,implies,anti 
+mantis,consultants,law,answers,organization,agnostics,cambridge,university,least,at,quiz +consultants,mantis,cambridge,answers,uk,university,agnostics,at,new,law,least +cambridge,uk,consultants,x,mantis,mexico,will,new,hoping,also,bobby +uk,x,cambridge,newsreader,consultants,mexico,new,one,will,lo,albuquerque +x,newsreader,uk,cambridge,rusnews,mexico,albuquerque,new,hear,will,lo +newsreader,x,rusnews,v,uk,hear,albuquerque,mexico,new,lo,serious +rusnews,v,newsreader,albuquerque,x,misinformation,world,serious,nntp,hear,mexico +v,rusnews,newsreader,lines,world,albuquerque,hear,nntp,distribution,jaeger,misinformation +jaeger,buphybuedu,otherwise,obviously,v,nntp,spreading,apology,lines,simply,special +buphybuedu,jaeger,gregg,v,worshipped,respect,year,let,someone,august,word +gregg,buphybuedu,questions,haven,jaeger,it,respect,islam,rules,justify,where +viceicotekcom,bobbe,robert,guy,some,beauchaine,mantiscouk,writes,mclucsbedu,good,from +bobbe,viceicotekcom,robert,point,mantiscouk,mclucsbedu,some,yet,guy,mcl,christians +robert,beauchaine,viceicotekcom,bobbe,guy,mantiscouk,yes,bake,faehl,provided,some +beauchaine,robert,bennett,guy,viceicotekcom,bake,timmons,neil,writes,clam,mantiscouk +bennett,neil,cannot,beauchaine,causethat,timmons,bake,also,knowledgeable,merely,or +neil,bennett,cannot,causethat,adapted,beauchaine,timmons,also,or,irrelevant,been +bcci,adapted,post,bad,rushdie,therefore,also,justify,holy,knows,translator +adapted,koran,bcci,post,implies,something,respond,neil,or,tell,knows +koran,adapted,rules,knows,post,managed,agnostics,include,university,baptist,theists +rules,banking,koran,quiz,post,baptist,bible,example,followed,ark,on +banking,rules,followed,ark,quiz,post,times,bible,john,baptist,spend +times,stating,august,extermination,beckoning,again,reader,banking,nothing,occasion,burden +august,let,times,discuss,burden,did,idol,extermination,so,proof,buphybuedu +let,august,we,propaganda,fit,see,proof,someone,name,s,so 
+guy,some,viceicotekcom,writes,robert,beauchaine,mantiscouk,bobbe,mere,article,someone +piece,obviously,spreading,anything,real,content,right,moslems,slippery,next,christians +title,implies,waste,looking,mmm,simply,even,those,very,a,something +implies,something,title,refering,adapted,was,he,an,law,organization,people +something,implies,refering,adapted,was,case,title,koran,who,proselytize,itself +case,then,must,stating,something,where,holies,contention,does,wish,prove +must,then,be,apologize,world,although,case,where,d,nntp,glad +haven,provided,questions,won,islam,gregg,rule,wrodhipped,offer,slanderous,t +provided,even,haven,won,bank,bake,clam,just,t,rule,viceicotekcom +even,managed,provided,not,looking,title,include,just,knows,won,questions +support,which,idols,spend,created,contention,out,bobby,sophisticated,job,pointing +contention,your,support,spend,this,case,stating,respond,put,times,bobby +intend,deankaflowitz,which,cbnewsjcbattcom,tammy,job,healy,r,anyway,christians,position +respect,titles,word,fit,for,occasion,gregg,says,buphybuedu,anyone,have +questions,haven,gregg,islam,any,and,knows,agnostics,titles,make,respect +managed,even,include,koran,statements,example,implies,agnostics,provided,adapted,blanket +include,managed,reference,koran,reliable,fashion,bible,above,burden,same,example +reference,above,include,head,or,knowledgeable,with,original,were,fashion,alyosha +head,firmly,engaged,reference,alyosha,illustrate,my,didn,causethat,correct,guilty +firmly,head,engaged,alyosha,guilty,illustrate,allegory,article,causethat,my,place +engaged,firmly,head,ass,in,article,illustrate,my,alyosha,allegory,obviously +ass,many,what,engaged,christians,some,excuse,guess,obviously,my,spreading +excuse,supports,referring,btw,apology,apart,ass,fit,v,might,posting +supports,excuse,nothing,thread,beckoning,lost,hear,this,higher,might,did +reason,believe,moslems,no,is,try,have,agnostics,to,meant,agree +anti,slander,islamic,job,an,another,causethat,superior,itself,example,which 
+slander,job,anti,islamic,while,wonderful,deleted,deankaflowitz,superior,knows,call +job,slander,anti,wonderful,islamic,apart,while,deankaflowitz,intend,which,call +also,respond,agree,cannot,bcci,stating,articles,bennett,atheists,arrogance,neil +apart,from,cfaehl,job,mathew,excuse,decay,little,slander,btw,serious +prejudices,own,report,discourage,from,exist,hear,priest,fit,mere,high +titles,fit,respect,only,for,idol,occasion,word,wrodhipped,and,questions +real,content,discussion,piece,providing,for,suggesting,titles,obviously,title,conspiracy +content,real,discussion,piece,obviously,spreading,for,t,engaged,providing,look +want,bobby,prove,again,references,true,to,join,lost,wonderful,although +tell,knows,can,oranges,re,deankaflowitz,rushdie,you,slander,adapted,job +bank,why,quiz,aren,whethe,provided,proof,graven,compelling,clam,answers +mere,report,instead,out,guy,did,article,pointing,conquered,correction,proselytize +report,mere,instead,prejudices,did,wish,otherwise,fit,proselytize,word,conquered +stating,times,cannot,also,case,oranges,again,been,hoping,followed,enter +followed,banking,god,rules,been,might,islamic,stating,idea,won,get +knows,islam,tell,koran,deankaflowitz,or,any,slander,good,bcci,questions +islam,knows,questions,good,haven,supporting,wrong,gregg,any,dean,slanderous +bad,therefore,were,created,r,seriously,bcci,healy,could,which,cannot +therefore,bad,cannot,respond,seriously,bcci,could,enter,r,created,misinformed +cannot,therefore,been,neil,bennett,respond,stating,have,also,does,bcci +been,cannot,islamic,have,followed,anyone,stating,neil,merely,glad,idea +otherwise,says,obviously,jaeger,dogma,discussion,dostoevsky,providing,explain,apology,simply +obviously,spreading,slanderous,piece,otherwise,jaeger,discussion,simply,content,ass,slippery +spreading,obviously,slanderous,propaganda,piece,jaeger,discussion,ass,slippery,absolutely,content +slanderous,propaganda,spreading,obviously,see,haven,don,posting,loved,jaeger,islam 
+propaganda,slanderous,spreading,see,let,someone,don,posting,obviously,loved,through +someone,wants,if,let,propaganda,see,thread,worshipped,some,lost,discuss +wants,someone,discuss,provide,some,if,worshipped,determine,saturnwwcedu,meant,up +discuss,provide,wants,issue,august,him,large,did,join,vestaunmedu,someone +glad,d,be,too,worse,misinformation,lo,issued,maybe,wish,must +discussion,providing,real,through,content,explained,obviously,shines,correction,conspiracy,suggesting +providing,discussion,through,shines,explained,real,etc,sorry,otherwise,conspiracy,btw +references,articles,provide,want,illustrate,to,try,because,bobby,can,respond +provide,down,discuss,put,wants,references,articles,sophisticated,idol,point,article +articles,agree,with,atheists,sophisticated,familiar,references,while,respond,you,provide +agree,articles,with,mmm,you,also,moslems,while,atheists,amusing,deleted +mmm,yes,agree,rule,and,amusing,title,would,atheists,try,while +intellectually,stimulating,very,justifications,debate,kept,pretty,sources,others,file,clear +stimulating,intellectually,debate,others,doubtless,under,worse,file,khan,idea,stated +debate,doubtless,stimulating,others,whethe,under,stated,intellectually,idea,relied,greater +doubtless,debate,stimulating,stated,whethe,idea,poster,absolutely,such,relied,large +spend,time,support,your,contention,reader,bobby,respond,banking,flocking,every +time,spend,soccultureislam,waste,try,down,sophisticated,put,of,your,not +soccultureislam,got,ve,time,indictment,atheist,look,worthy,spend,banking,won +got,ve,special,soccultureislam,just,call,place,atheist,an,issued,rule +special,place,got,rule,looking,never,faith,simply,just,jaeger,thanks +file,right,kill,others,next,stimulating,wont,intellectually,dostoevsky,hoping,piece +right,next,file,illustrate,piece,moslems,kill,dostoevsky,didn,anything,allegory +next,right,bobby,file,moslems,piece,illustrate,which,respond,anything,determine 
+bobby,want,lost,next,unsympathetic,support,every,which,prove,familiar,referring +join,him,misinformed,every,wholesome,bobby,yet,discuss,burden,stronger,worshipped +become,convinced,idea,look,might,get,issue,must,issued,then,here +convinced,become,idea,debate,stated,must,doubtless,poster,thought,issued,worse +simply,waste,a,obviously,usenet,issued,slippery,special,looking,jaeger,kept +waste,simply,time,correction,looking,title,conspiracy,image,graven,a,suggesting +try,time,reason,memory,mmm,sophisticated,references,under,he,bzzt,put +moslems,atheists,with,reason,oranges,bzzt,do,next,agree,piece,today +hoping,achieve,are,arrogant,wont,cheribums,file,has,didn,cambridge,stating +achieve,mathew,hoping,all,indicting,host,cfaehl,mantiscouk,which,did,thread From ef37bcbb9f20ea9914bb186ce18e4b05239996ed Mon Sep 17 00:00:00 2001 From: Samin <69201742+saminbassiri@users.noreply.github.com> Date: Tue, 4 Feb 2025 08:16:37 +0100 Subject: [PATCH 8/9] Add BuiltinGloVeTest.java - This test first runs the DML script to generate the top K most similar words for each word in the GloVe word embedding. - Then, it computes the accuracy of the DML results based on the hits of the most similar words for the entire vocabulary, comparing the expected results with the DML output. - To validate the correctness of our GloVe word embedding implementation, we employ a Controlled Overfitting Validation approach. - This methodology addresses the inherent challenge of testing stochastic algorithms, where random initialization typically prevents direct output comparison between different runs or implementations. 
--- .../builtin/part1/BuiltinGloVeTest.java | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinGloVeTest.java diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinGloVeTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinGloVeTest.java new file mode 100644 index 00000000000..b0b4bec781c --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinGloVeTest.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.builtin.part1; + +import java.io.IOException; +import java.util.Objects; + +import org.apache.sysds.common.Types; +import org.apache.sysds.common.Types.FileFormat; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.junit.Test; + +public class BuiltinGloVeTest extends AutomatedTestBase { + + private static final String TEST_NAME = "glove"; + private static final String TEST_DIR = "functions/builtin/"; + private static final String RESOURCE_DIRECTORY = "./src/test/resources/datasets/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinGloVeTest.class.getSimpleName() + "/"; + + private final static Types.ValueType[] schema = {Types.ValueType.STRING}; + + private static final int TOP_K = 5; + private static final double ACCURACY_THRESHOLD = 0.85; + + private static final double seed = 45; + private static final double vector_size = 100; + private static final double alpha = 0.75; + private static final double eta = 0.05; + private static final double x_max = 100; + private static final double tol = 1e-4; + private static final double iterations = 10000; + private static final double print_loss_it = 100; + private static final double maxTokens = 2600; + private static final double windowSize = 15; + private static final String distanceWeighting = "TRUE"; + private static final String symmetric = "TRUE"; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME, + new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"out_result"})); + } + + @Test + public void gloveTest() throws IOException{ + // Using top-5 words for similarity comparison + runGloVe(TOP_K); + + // Read the computed similarity results from SystemDS + FrameBlock computedSimilarity = readDMLFrameFromHDFS("out_result", FileFormat.CSV); + + // Load expected results (precomputed in Python) + FrameBlock 
expectedSimilarity = readDMLFrameFromHDFS(RESOURCE_DIRECTORY + "/GloVe/gloveExpectedTop10.csv", FileFormat.CSV, false); + + // Compute accuracy by comparing computed and expected results + double accuracy = computeAccuracy(computedSimilarity, expectedSimilarity, TOP_K); + + System.out.println("Computed Accuracy: " + accuracy); + + // Ensure accuracy is above a reasonable threshold + assert accuracy > ACCURACY_THRESHOLD : "Accuracy too low! Expected > 85% match."; + } + + public void runGloVe(int topK) { + // Load test configuration + Types.ExecMode platformOld = setExecMode(Types.ExecType.CP); + try { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + + String HOME = SCRIPT_DIR + TEST_DIR; + + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + + programArgs = new String[] { + "-nvargs", + "input=" + RESOURCE_DIRECTORY + "20news/20news_subset_untokenized.csv", + "seed=" + seed, + "vector_size=" + vector_size, + "alpha=" + alpha, + "eta=" + eta, + "x_max=" + x_max, + "tol=" + tol, + "iterations=" + iterations, + "print_loss_it=" + print_loss_it, + "maxTokens=" + maxTokens, + "windowSize=" + windowSize, + "distanceWeighting=" + distanceWeighting, + "symmetric=" + symmetric, + "topK=" + topK, + "out_result=" + output("out_result") + }; + + System.out.println("Running DML script..."); + runTest(true, false, null, -1); + System.out.println("Test execution completed."); + } finally { + rtplatform = platformOld; + } + } + + /** + * Computes accuracy by comparing top-K similar words between computed and expected results. 
+ */ + private double computeAccuracy(FrameBlock computed, FrameBlock expected, int k) { + int count = 0; + for (int i = 0; i < computed.getNumRows(); i++) { + int matchCount = 0; + for (int j = 1; j < k; j++) { + String word1 = computed.getString(i, j); + for (int m = 0; m < k; m++) { + if (Objects.equals(word1, expected.getString(i, m))) { + matchCount++; + break; + } + } + } + if (matchCount > 0) { + count++; + } + } + return (double) count / computed.getNumRows(); + } +} From 5f2321c4df741b963ee859208ef234ea0155dc95 Mon Sep 17 00:00:00 2001 From: Xixuan Zhang <119933243+xixuanzhang2022@users.noreply.github.com> Date: Tue, 4 Feb 2025 08:42:22 +0100 Subject: [PATCH 9/9] [SYSTEMDS-3179] Delete scripts/builtin/gloveValidityTest.dml Deleted the script for glove test since it is located in the wrong folder. Corresponding test added to the right directory. --- scripts/builtin/gloveValidityTest.dml | 124 -------------------------- 1 file changed, 124 deletions(-) delete mode 100644 scripts/builtin/gloveValidityTest.dml diff --git a/scripts/builtin/gloveValidityTest.dml b/scripts/builtin/gloveValidityTest.dml deleted file mode 100644 index 3b254a1bdfc..00000000000 --- a/scripts/builtin/gloveValidityTest.dml +++ /dev/null @@ -1,124 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -cosine_similarity = function(matrix[double] trained_emb) - return (matrix[double] cosine_sim){ - /* - * Computes cosine similarity between word embeddings. - * - * Inputs: - * - trained_emb: Matrix of word embeddings of shape (N, D), where N is the number of words and D is the embedding dimension. - * - * Outputs: - * - cosine_sim: Matrix of cosine similarity scores between word embeddings, shape (N, N). - */ - dot_product = trained_emb %*% t(trained_emb); - row_norms = rowSums(trained_emb^2) ^ 0.5; - denominator = row_norms %*% t(row_norms); - cosine_sim = dot_product / denominator; -} - -get_top = function(matrix[double] trained_emb, int k) - return (matrix[double] topN_indices, matrix[double] topN_values){ - /* - * Retrieves the top k most similar word vectors for each word. - * - * Inputs: - * - trained_emb: Matrix of word embeddings of shape (N, D). - * - k: Number of top similar words to retrieve. - * - * Outputs: - * - topN_indices: Matrix containing indices of the top k most similar words for each word, shape (N, k). - * - topN_values: Matrix containing similarity values of the top k most similar words for each word, shape (N, k). 
- */ - S = cosine_similarity(trained_emb); - n = nrow(S); - I = diag(matrix(1, rows=nrow(S), cols=1)); - S = S * (1 - I) + (-1.0 * I); - - # Get the sorted indices (descending order) of each row - topN_indices = matrix(0, rows=n, cols=k); - topN_values = matrix(0, rows=n, cols=k); - - for (i in 1:n){ - Scol = S[,i]; - topN = order(target=Scol, by=1, decreasing=TRUE, index.return=TRUE); - topNval = order(target=Scol, by=1, decreasing=TRUE, index.return=FALSE); - - topN = topN[1:k,]; - topv = topNval[1:k,]; - - topN_indices[i,] = t(topN); - topN_values[i,] = t(topv); - } -} - -accuracy = function(frame[unknown] X, frame[unknown] Y, int k) - return (double acc){ - /* - * Computes accuracy based on the overlap of the top-k nearest neighbors between two sets of word embeddings. - * - * Inputs: - * - X: First data frame containing a word column and embedding columns (shape: N x (D+1)). - * - Y: Second data frame containing a word column and embedding columns (shape: N x (D+1)). - * - * Outputs: - * - acc: Scalar value representing the accuracy as the fraction of words where the top-k nearest neighbors match between X and Y. 
- */ - - # Extract only the embedding values, excluding the word column - X = as.matrix(X[,2:ncol(X)]); - Y = as.matrix(Y[,2:ncol(Y)]); - - [A, Aval] = get_top(X, k); - [B, Bval] = get_top(Y, k); - - count = 0; - for (i in 1:nrow(A)){ - # Select a specific row from both matrices (change row indices as needed) - row1 = A[i,]; # First row of A (1 x m) - row2 = B[i,]; # First row of B (1 x n) - - # Expand both rows to create a pairwise comparison matrix - row1_expanded = t(row1) %*% matrix(1, rows=1, cols=ncol(row2)); # (m x n) - row2_expanded = matrix(1, rows=ncol(row1), cols=1) %*% row2; # (m x n) - - # Element-wise comparison to find matches - matches = (row1_expanded == row2_expanded); # (m x n) Boolean matrix - - # Reduce to find unique elements that exist in both rows - overlap_mask = rowSums(matches) > 0; # (1 x m) mask of matched elements - common_elements = row1 * t(overlap_mask); # Retain only matching elements - - # Remove zeros (non-matching elements set to 0) - filtered_common_elements = removeEmpty(target=common_elements, margin="cols"); - - if (as.scalar(filtered_common_elements[1,1])!= 0){ - count = count+1; - } - } - acc = count/nrow(A); #output -} - -#X = read($1, format="csv", sep = " ", header=FALSE, data_type="frame"); -#Y = read($2, format="csv", sep = " ", header=FALSE, data_type="frame"); -#k = 10; -#accuracy(X,Y,k);