#NOTE: While (almost) all of the code below appears in the associated Rmd and PDF files,
#the exact order in which each code chunk appears has been altered in the final versions for
#clarity's sake. Similarly, while the purpose of all of the code is documented here, the exact
#wording has been updated/refined for ease of reading in the R Markdown and PDF formats.
#PREAMBLE CODE (downloading/reading dataset and making the edx and validation datasets)
#Setting up the edx and validation datasets (CODE PROVIDED BY COURSE)
if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")
if(!require(caret)) install.packages("caret", repos = "http://cran.us.r-project.org")
if(!require(data.table)) install.packages("data.table", repos = "http://cran.us.r-project.org")
library(tidyverse)
library(caret)
library(data.table)
# MovieLens 10M dataset:
# https://grouplens.org/datasets/movielens/10m/
# http://files.grouplens.org/datasets/movielens/ml-10m.zip
dl <- tempfile()
download.file("https://files.grouplens.org/datasets/movielens/ml-10m.zip", dl)
ratings <- fread(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
col.names = c("userId", "movieId", "rating", "timestamp"))
movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")
# if using R 4.0 or later:
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(movieId),
title = as.character(title),
genres = as.character(genres))
movielens <- left_join(ratings, movies, by = "movieId")
# Validation set will be 10% of MovieLens data
set.seed(1, sample.kind="Rounding") # if using R 3.5 or earlier, use `set.seed(1)`
test_index <- createDataPartition(y = movielens$rating, times = 1, p = 0.1, list = FALSE)
edx <- movielens[-test_index,]
temp <- movielens[test_index,]
# Make sure userId and movieId in validation set are also in edx set
validation <- temp %>%
semi_join(edx, by = "movieId") %>%
semi_join(edx, by = "userId")
# Add rows removed from validation set back into edx set
removed <- anti_join(temp, validation)
edx <- rbind(edx, removed)
rm(dl, ratings, movies, test_index, temp, movielens, removed)
#END OF PREAMBLE CODE
#EXPLORING THE EDX AND VALIDATION SETS
#Installing (if necessary) and calling the lubridate library.
if(!require(lubridate)) install.packages("lubridate", repos = "http://cran.us.r-project.org")
library(lubridate)
#Confirming the dimensions of the edx dataset
dim(edx) #This should be approx. 9M rows and 6 columns
#Confirming the dimensions of the validation dataset
dim(validation) #This should be approximately 1M rows and 6 columns.
#Looking at the start of the edx dataset
head(edx)
#Checking for missing values in edx or validation
anyNA(edx)
anyNA(validation)
#Getting a basic numeric summary of the movie ratings
summary(edx$rating)
#Checking how many unique users gave at least 1 review and how many movies have been rated.
edx %>%
summarize(number_of_raters = n_distinct(userId), #there are 69878 raters.
number_of_movies = n_distinct(movieId)) #there are 10677 movies.
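#NOTE: With 69878 users and 10677 movies, a complete user-by-movie rating matrix would contain
#roughly 746 million cells, yet edx holds only about 9 million ratings. Well under 2% of all
#possible (user, movie) pairs are observed, and this sparsity motivates the bias-based and
#matrix factorization models developed below.
n_distinct(edx$userId) * n_distinct(edx$movieId) #number of possible user-movie pairs (~746M)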
#Checking the number of unique movie genres
length(unique(edx$genres)) #there are 797 distinct genre combinations.
#Checking the frequency of each rating
edx %>% group_by(rating) %>%
summarize(count = n()) %>%
arrange(desc(count)) #this is ordered in terms of most common to least common rating
#Visualizing the proportions of each rating with a bar chart.
edx %>%
  ggplot(aes(x = rating, y = after_stat(prop), group = 1)) +
  geom_bar(color = "black", fill = "red") +
  scale_x_continuous(breaks = seq(0, 5, by = 0.5)) +
  ggtitle("Proportional Distribution of the Ratings") +
  labs(x = "Rating", y = "Proportion of Ratings")
#Determining the 15 movies with the largest number of reviews.
most_rated_15 <- edx %>%
group_by(title) %>%
summarize(count = n()) %>%
top_n(15, count) %>%
arrange(desc(count))
#Visualizing the top 15 most rated movies.
most_rated_15 %>%
  ggplot(aes(reorder(title, count), count)) +
  geom_bar(color = "black", fill = "red", stat = "identity") +
  ggtitle("Most Commonly Rated Films") +
  coord_flip(ylim = c(0, 42000)) +
  labs(y = "Number of Ratings", x = "") +
  geom_text(aes(label = count), hjust = -0.2, size = 3)
#Determining the movies with 10 or fewer ratings and counting how many
least_rated <- edx %>%
group_by(title) %>%
summarize(count = n()) %>%
filter(count <= 10)
nrow(least_rated) #There are 1139 such movies.
#Determining the movies with 1000+ ratings and counting how many there are.
most_rated <- edx %>%
group_by(title) %>%
summarize(count = n()) %>%
filter(count >= 1000)
nrow(most_rated)
#Counting the ratings given by each user (and determining summary statistics)
user_rating_count <- edx %>%
group_by(userId) %>%
summarize(count = n())
summary(user_rating_count$count)
#NOTE: The median number of ratings per user is 62, which is well below the
#mean (128.8). This indicates a right-skewed distribution: a small share of users rated far
#more movies than is typical. This is better seen in the following visualization.
#Visualizing the distribution of the number of ratings given by each user.
user_rating_count %>%
  ggplot(aes(count)) +
  geom_histogram(binwidth = 0.2, color = "black", fill = "red") +
  scale_x_log10() +
  labs(x = "Number of Ratings", y = "Number of Users") +
  ggtitle("Distribution of the Number of Ratings Given by Users")
#NOTE: Users also differ in how generously they rate on average (i.e. few users average exactly
#the overall mean). To account for this, consider the following visualization of the average
#rating given by each user.
edx %>%
  group_by(userId) %>%
  summarize(avg_rating = mean(rating)) %>%
  ggplot(aes(avg_rating)) +
  geom_histogram(binwidth = 0.2, color = "black", fill = "red") +
  labs(x = "Average Rating", y = "Number of Users") +
  ggtitle("Average Movie Rating Given by Users")
#NOTE: There is a general trend to rate movies above 3.
#PREPARING AND CHECKING THE TRAINING AND TESTING SETS (derived from the edx dataset)
#Splitting the edx dataset into train and test sets (90% to train, 10% to test)
set.seed(1, sample.kind = "Rounding")
index <- createDataPartition(edx$movieId, times = 1, p = 0.9, list = FALSE)
train_edx <- edx %>%
slice(index)
test_edx <- edx %>%
slice(-index)
#Checking the dimensions of both sets.
dim(train_edx) #This has 8,100,051 rows and 6 columns.
dim(test_edx) #This has 900,004 rows and 6 columns.
#Looking at the beginning of the sets.
head(train_edx)
head(test_edx)
#Removing any users and movies that are in the test_edx set but are not in the train_edx set.
test_edx <- test_edx %>%
semi_join(train_edx, by = "movieId") %>%
semi_join(train_edx, by = "userId")
#Confirming that no NA values have been generated
any(is.na(train_edx))
any(is.na(test_edx))
#BEGINNING OF THE ALGORITHM DEVELOPMENT
#SETTING UP THE RMSE FUNCTION (the loss function, i.e. the measure of how well a model predicts)
rmse <- function(true_rating, predicted_rating){
sqrt(mean((true_rating - predicted_rating)^2))
}
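#Quick sanity check of the rmse function on made-up values: with one prediction off by 0.5
#and one exactly right, the RMSE should be sqrt((0.5^2 + 0^2) / 2), i.e. about 0.354.
rmse(c(4, 3.5), c(3.5, 3.5)) #returns approximately 0.3535534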
#MODEL 1: Just the average (no other factors considered)
#Calculating the average rating for movies (ignores any possible biases)
mu <- mean(train_edx$rating)
mu #This is the average rating of the movies in the training set.
#Calculating the RMSE score based on mu alone.
model1_rmse <- rmse(test_edx$rating, mu)
model1_rmse #This is 1.05931
#Making a table to track (and ultimately compare) RMSE scores across models.
model_comparison <- data.frame(method = "Average Movie Rating ONLY",
RMSE = model1_rmse)
#MODEL 2: Average + movie bias
#Individual movies are rated systematically above or below the overall average, so the model
#needs to account for a per-movie effect as well (recall that mu = 3.512478).
mu <- mean(train_edx$rating) #this is the average rating
#I will now use an approximation of the least squares estimate of the movie bias (because
#actually computing the full estimate would freeze or crash my computer).
movie_bias <- train_edx %>%
group_by(movieId) %>%
summarize(b_i = mean(rating - mu))
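#For context, the exact least squares fit would be lm(rating ~ as.factor(movieId), data = train_edx),
#which would need to estimate one coefficient per movie over roughly 8.1M rows; the per-movie mean
#of (rating - mu) used above is the least squares estimate of each b_i for this simple model.
n_distinct(train_edx$movieId) #number of coefficients the exact lm() fit would require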
#Plotting the movie_bias b_i scores (the distance of each movie's average rating from the overall average)
movie_bias %>%
  ggplot(aes(b_i)) +
  geom_histogram(binwidth = 0.2, color = "black", fill = "red") +
  labs(x = "Distance from Overall Average Movie Rating",
       y = "Number of Movies") +
  ggtitle("Distribution of the Movie Bias Estimates")
#Predicting the ratings for test_edx
model2_predicted <- mu + test_edx %>%
left_join(movie_bias, by = "movieId") %>%
.$b_i
#Computing the RMSE score for model 2
model2_rmse <- rmse(model2_predicted, test_edx$rating)
model2_rmse #Approximately 0.942
#Adding model2_rmse to the model_comparison table (I'll show the whole table at the end).
model_comparison <- bind_rows(model_comparison,
data.frame(method = "Movie Bias Model",
RMSE = model2_rmse))
#MODEL 3: Average + User Bias
#Since model 2 only considered the bias that comes from particular movies being rated higher
#or lower on average, I will now consider the effect that individual users' rating tendencies
#have on the ratings.
#NOTE: mu is still the same as before.
user_bias <- train_edx %>%
  group_by(userId) %>%
  summarize(b_u = mean(rating - mu))
#NOTE: b_u is the average difference between a particular user's ratings and the overall
#average movie rating.
#Plotting the user_bias b_u scores.
user_bias %>%
  ggplot(aes(b_u)) +
  geom_histogram(binwidth = 0.2, color = "black", fill = "red") +
  labs(x = "Distance from Overall Average Movie Rating",
       y = "Number of Users") +
  ggtitle("Distribution of the User Bias Estimates")
#This is mostly symmetric about 0 but there does appear to be a slight trend among users
#to rate movies above the overall average rating (which is approximately 3.5).
#Predicting the ratings in the test set based on user bias
model3_predicted <- mu + test_edx %>%
left_join(user_bias, by = "userId") %>%
.$b_u
#Calculating the RMSE for model 3
model3_rmse <- rmse(model3_predicted, test_edx$rating)
model3_rmse #This is approximately 0.978, which is worse than model 2.
#Adding model 3 to the table
model_comparison <- bind_rows(model_comparison,
data.frame(method = "User Bias Model",
RMSE = model3_rmse))
#So, predicting based on movie bias is better than predicting based on user bias, but
#neither is particularly good.
#MODEL 4: Average + User bias + Movie bias
#This model will take account of both the user bias (as in model 3) and the movie bias (as in model 2).
#NOTE: This is still an approximation (per-user means of the residuals) due to computational constraints.
user_movie_biases <- train_edx %>%
  left_join(movie_bias, by = "movieId") %>%
  group_by(userId) %>%
  summarize(b_w = mean(rating - mu - b_i)) #b_w is the user effect after removing the movie effect
#Predicting ratings based on user and movie biases.
model4_predicted <- test_edx %>%
left_join(movie_bias, by = "movieId") %>%
left_join(user_movie_biases, by = "userId") %>%
mutate(prediction = mu + b_i + b_w) %>%
.$prediction
#Computing the RMSE for model 4
model4_rmse <- rmse(model4_predicted, test_edx$rating)
model4_rmse #This is approximately 0.8645122.
#Adding model 4 to the table.
model_comparison <- bind_rows(model_comparison,
data.frame(method = "Movie + User Bias Model",
RMSE = model4_rmse))
#So, we now have an RMSE of 0.8645122, which is a significant improvement.
#CHECKING WHAT THE MODEL COMPARISON TABLE CURRENTLY LOOKS LIKE
model_comparison %>% knitr::kable()
#MODEL 5: Regularized Model 4
#I will use the tuning parameter lambda to penalize (shrink toward zero) the bias estimates of
#films that received very few ratings and of users who rated only a handful of films. Since the
#optimal lambda is unknown, I will try multiple values.
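#To see how the penalty works, consider a toy example (made-up values): the regularized estimate
#sum(rating - mu) / (n() + lambda) shrinks toward 0 when n is small but is nearly the plain mean
#when n is large.
resid_few <- c(1.5, 1.5) #a movie with only 2 ratings, both 1.5 above mu
sum(resid_few) / (length(resid_few) + 5) #with lambda = 5: ~0.43, strongly shrunk from 1.5
resid_many <- rep(1.5, 1000) #a movie with 1000 such ratings
sum(resid_many) / (length(resid_many) + 5) #with lambda = 5: ~1.49, essentially the plain mean
rm(resid_few, resid_many)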
lambdas <- seq(0, 10, 0.25)
#Computing the RMSE values based on the range of lambda values
rmse_regularized <- sapply(lambdas, function(l){
mu <- mean(train_edx$rating)
b_i <- train_edx %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu) / (n() + l)) #this is movie bias.
b_u <- train_edx %>%
left_join(b_i, by = "movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu) / (n() + l)) #this is the user bias, after removing the movie bias.
predicted_rating <- test_edx %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(prediction = mu + b_i + b_u) %>%
.$prediction
return(rmse(predicted_rating, test_edx$rating))
})
#Showing the lambdas compared to their rmse scores.
qplot(lambdas, rmse_regularized)
#Determining the lambda with the smallest RMSE
lambda <- lambdas[which.min(rmse_regularized)]
lambda #This is 5.5
#Applying the optimal lambda to get the best RMSE score
model5_rmse <- min(rmse_regularized)
model5_rmse #This is approx. 0.8638, which is below the required threshold.
#Adding model 5 to the table
model_comparison <- bind_rows(model_comparison,
data.frame(method = "Regularized User + Movie Bias Model",
RMSE = model5_rmse))
#CHECKING WHAT THE MODEL COMPARISON TABLE CURRENTLY LOOKS LIKE
model_comparison %>% knitr::kable()
#MODEL 6: Regularized User + Movie + Time Bias Model
#In this model, I will also consider the date of each rating (i.e. the timestamp data).
#To do this, I will convert the timestamps into a more user-friendly date format.
#I will also provide a visualization of the average rating over time.
#The dates will then be rounded to the nearest month and incorporated into the regularized
#model in order to make a regularized model for User + Movie + Time bias.
#Making new versions of train_edx and test_edx that include the converted dates, rounded
#to the nearest month.
train_edx_norm_dates <- train_edx %>%
mutate(round_dates = round_date(as_datetime(timestamp), unit = "month"))
test_edx_norm_dates <- test_edx %>%
mutate(round_dates = round_date(as_datetime(timestamp), unit = "month"))
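#Quick check of the conversion on a single training row (output depends on the row, but the
#timestamp should become a POSIXct date rounded to the first of its month).
train_edx_norm_dates$round_dates[1]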
#Plotting the average rating over time (based on the monthly average rating).
train_edx_norm_dates %>%
group_by(round_dates) %>%
summarize(average_rating = mean(rating)) %>%
ggplot(aes(round_dates, average_rating)) +
geom_point() +
xlab("Date") +
ylab("Average Movie Rating") +
ggtitle("Average Movie Ratings over Time")
#The plot shows that (aside from one point in 1995) the average rating is slightly higher from
#about 1997 to 2001, but the increase is not especially large. So, while including time as a
#possible source of bias should improve the model, the amount of improvement is
#likely to be quite small.
lambdas <- seq(0, 10, 0.25)
#Computing the RMSE values based on the range of lambda values
rmse_regularized_2 <- sapply(lambdas, function(l){
mu <- mean(train_edx$rating)
b_i <- train_edx_norm_dates %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu) / (n() + l)) #this is movie bias.
b_u <- train_edx_norm_dates %>%
left_join(b_i, by = "movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu) / (n() + l)) #this is the user bias, after removing the movie bias.
b_t <- train_edx_norm_dates %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
group_by(round_dates) %>%
summarize(b_t = sum(rating - b_i - b_u - mu) / (n() + l))
predicted_rating <- test_edx_norm_dates %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
left_join(b_t, by = "round_dates") %>%
mutate(prediction = mu + b_i + b_u + b_t) %>%
.$prediction
return(rmse(predicted_rating, test_edx_norm_dates$rating))
})
#Showing the lambdas compared to their rmse scores.
qplot(lambdas, rmse_regularized_2)
#Determining the lambda with the smallest RMSE
lambda <- lambdas[which.min(rmse_regularized_2)]
lambda #This is 5.75.
#Applying the optimal lambda to get the best RMSE score
model6_rmse <- min(rmse_regularized_2)
model6_rmse #This is approx. 0.86377, which is a mild improvement on model 5.
#Adding model 6 to the table
model_comparison <- bind_rows(model_comparison,
data.frame(method = "Regularized User + Movie + Time Bias Model",
RMSE = model6_rmse))
#MODEL 7: Using the Recosystem package
#Installing the recosystem package if necessary.
if(!require(recosystem)) install.packages("recosystem", repos = "http://cran.us.r-project.org")
#Calling the recosystem package.
library(recosystem)
#NOTE: The recosystem package is based on matrix factorization and requires that the data be
#supplied in sparse matrix triplet form (one (user, item, rating) triple per rating). So, the
#first step is to format the train_edx and test_edx sets accordingly.
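#ASIDE (an illustrative toy example, not part of the model): matrix factorization approximates
#the very sparse user x movie rating matrix R by the product P %*% t(Q), where P (users x k) and
#Q (movies x k) hold k latent factors per user and per movie. With made-up numbers:
P_toy <- matrix(c(0.9, 0.1,
                  0.2, 0.8), nrow = 2, byrow = TRUE) #2 users, k = 2 latent factors
Q_toy <- matrix(c(1.0, 0.0,
                  0.5, 0.5), nrow = 2, byrow = TRUE) #2 movies, k = 2 latent factors
P_toy %*% t(Q_toy) #entry [u, i] approximates the rating of movie i by user u
rm(P_toy, Q_toy)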
#NOTE: Since this will not need the timestamp information, I will go back to using train_edx
#and test_edx instead of train_edx_norm_dates and test_edx_norm_dates.
train_edx_sparMatTrip <- with(train_edx,
                              data_memory(user_index = userId, item_index = movieId, rating = rating))
test_edx_sparMatTrip <- with(test_edx,
                             data_memory(user_index = userId, item_index = movieId, rating = rating))
#NOTE: The movieIds are being used to index the elements in the matrix.
#Making an R object of class "RecoSys" (to be used subsequently)
rec_obj <- Reco()
#Tuning the recosystem object (opts is how I am controlling the details).
#dim is the number of latent factors in the data.
#costp_l2 is the L2 regularization cost for the user factors.
#costq_l2 is the L2 regularization cost for the item (movie) factors.
#lrate is the learning rate, i.e. the step size of the gradient descent.
#niter is the number of iterations.
#NOTE: p and q refer to the user and item factor matrices, respectively.
#WARNING!!!: I do NOT recommend actually running this unless your computer has 32GB+ of RAM.
#I ran it on a 16GB, Intel i7 PC and it took over 30 minutes to run.
rec_tuner <- rec_obj$tune(train_edx_sparMatTrip,
                          opts = list(dim = c(10, 30),
                                      costp_l2 = c(0.01, 0.1),
                                      costq_l2 = c(0.01, 0.1),
                                      lrate = c(0.01, 0.1),
                                      niter = 15))
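#Inspecting the tuning results (assuming the tune() call above was run): rec_tuner$min is a
#named list holding the best parameter combination found, and rec_tuner$res is a data frame of
#every combination tried along with its cross-validated loss.
rec_tuner$min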
#Training the model
rec_obj$train(train_edx_sparMatTrip,
opts = c(rec_tuner$min, nthread = 4, niter = 30))
#Making the predictions
rec_predictions <- rec_obj$predict(test_edx_sparMatTrip, out_memory())
#Calculating the rmse
model7_rmse <- rmse(rec_predictions, test_edx$rating)
model7_rmse #this is approx. 0.785, by far the best result so far.
#Adding model 7 to list of models
model_comparison <- bind_rows(model_comparison,
data.frame(method = "Recosystem Matrix Factorization",
RMSE = model7_rmse))
#Showing what all of the models look like based on Edx dataset.
model_comparison %>% knitr::kable()
#MODEL 8: Final Version (same idea as model 7 but with edx and validation sets)
#Making Sparse Matrix Triplet versions of Edx and Validation.
edx_sparMatTrip <- with(edx, data_memory(user_index = userId, item_index = movieId, rating = rating))
valid_sparMatTrip <- with(validation, data_memory(user_index = userId, item_index = movieId, rating = rating))
#Making an R object of class "RecoSys" (to be used subsequently)
rec_obj_valid <- Reco()
#Tuning the recosystem object rec_obj_valid
#WARNING!!!: I do NOT recommend actually running this unless your computer has 32GB+ of RAM.
#I ran it on a 16GB, Intel i7 PC and it took over 60 minutes to run.
rec_tuner <- rec_obj_valid$tune(edx_sparMatTrip,
                                opts = list(dim = c(10, 30),
                                            costp_l2 = c(0.01, 0.1),
                                            costq_l2 = c(0.01, 0.1),
                                            lrate = c(0.01, 0.1),
                                            niter = 15))
#Training the model
rec_obj_valid$train(edx_sparMatTrip,
opts = c(rec_tuner$min, nthread = 4, niter = 30))
#Making the predictions
rec_predictions <- rec_obj_valid$predict(valid_sparMatTrip, out_memory())
#Calculating the rmse
model8_rmse <- rmse(rec_predictions, validation$rating)
model8_rmse #This is approx. 0.7809.
#Adding the final model to the table of model comparisons
model_comparison <- bind_rows(model_comparison,
data.frame(method = "Final Model (Recosystem)",
RMSE = model8_rmse))
#Reviewing all of the models
model_comparison %>% knitr::kable()
#Sources used
#https://cran.r-project.org/web/packages/recosystem/recosystem.pdf
#https://github.com/yixuan/recosystem
#https://rdrr.io/cran/recosystem/man/tune.html
#Book: R for Everyone: Advanced Analytics and Graphics (Jared P. Lander)