Project 1
Briefly describe the recommender system that you’re going to build out from a business perspective, e.g. “This system recommends data science books to readers.”
This system recommends movies to users.
Find a dataset, or build out your own toy dataset. As a minimum requirement for complexity, please include numeric ratings for at least five users, across at least five items, with some missing data.
I created a dataset with ratings from 20 users across 7 movies. The dataset is located here: https://raw.githubusercontent.com/ashishsm1986/Cuny-Assignments/master/Data-612/Week1/ashish-movies.csv
Load your data into (for example) an R or pandas dataframe, a Python dictionary or list of lists, (or another data structure of your choosing). From there, create a user-item matrix.
# read.csv() already returns a data frame
ratings_dataset <- read.csv('https://raw.githubusercontent.com/ashishsm1986/Cuny-Assignments/master/Data-612/Week1/ashish-movies.csv', stringsAsFactors = FALSE)
ratings_dataset
## userId Toy.Story Jumanji Interstellar GoldenEye American.President Turning
## 1 1 3.5 3.0 4.0 3.0 5.0 4.5
## 2 2 4.0 3.0 3.0 2.0 3.0 4.0
## 3 3 3.5 3.5 5.0 5.0 4.0 2.0
## 4 4 3.0 4.0 3.0 4.0 3.0 NA
## 5 5 2.5 2.5 4.0 4.5 4.5 3.5
## 6 6 2.5 3.0 3.5 3.5 3.0 4.0
## 7 7 4.0 3.0 5.0 NA 4.0 4.5
## 8 8 3.0 5.0 4.0 4.0 3.0 5.0
## 9 9 4.0 4.0 3.0 2.0 5.0 5.0
## 10 10 4.0 5.0 2.0 4.0 5.0 4.0
## 11 11 3.0 1.5 4.5 2.0 4.0 4.5
## 12 12 4.0 3.0 1.0 3.0 3.0 4.0
## 13 13 3.0 3.5 5.0 3.0 4.0 4.0
## 14 14 3.5 3.0 2.0 3.0 4.0 3.0
## 15 15 2.0 2.0 4.0 4.0 5.0 2.0
## 16 16 4.0 4.0 2.0 1.0 2.0 3.0
## 17 17 4.5 4.0 3.0 3.0 4.0 2.0
## 18 18 5.0 3.5 3.0 4.0 3.0 3.0
## 19 19 4.0 4.0 2.0 4.0 3.0 4.0
## 20 20 4.0 4.0 5.0 4.0 4.0 2.0
## The.Martian
## 1 3.0
## 2 2.0
## 3 2.5
## 4 5.0
## 5 1.5
## 6 2.0
## 7 2.0
## 8 2.0
## 9 3.5
## 10 3.0
## 11 4.0
## 12 2.0
## 13 5.0
## 14 3.0
## 15 3.0
## 16 4.0
## 17 2.0
## 18 2.0
## 19 2.0
## 20 4.0
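The data frame already has one row per user and one column per movie, so a user-item matrix can be obtained by dropping the userId column and keeping it as row labels. A minimal sketch (the object name user_item_matrix is my own):
user_item_matrix <- as.matrix(ratings_dataset[, -1]) # drop the userId column
rownames(user_item_matrix) <- ratings_dataset$userId # keep userId as row labels
dim(user_item_matrix) # 20 users x 7 movies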
Break your ratings into separate training and test datasets.
Assigning 70% of the data to the training set and 30% to the test set.
set.seed(5)
indexes <- sample(seq_len(nrow(ratings_dataset)), size = floor(.70*nrow(ratings_dataset)))
trainset <- ratings_dataset[indexes, ]
testset <- ratings_dataset[-indexes, ]
trainset
## userId Toy.Story Jumanji Interstellar GoldenEye American.President Turning
## 2 2 4.0 3.0 3.0 2.0 3.0 4.0
## 11 11 3.0 1.5 4.5 2.0 4.0 4.5
## 15 15 2.0 2.0 4.0 4.0 5.0 2.0
## 19 19 4.0 4.0 2.0 4.0 3.0 4.0
## 9 9 4.0 4.0 3.0 2.0 5.0 5.0
## 16 16 4.0 4.0 2.0 1.0 2.0 3.0
## 5 5 2.5 2.5 4.0 4.5 4.5 3.5
## 7 7 4.0 3.0 5.0 NA 4.0 4.5
## 13 13 3.0 3.5 5.0 3.0 4.0 4.0
## 3 3 3.5 3.5 5.0 5.0 4.0 2.0
## 17 17 4.5 4.0 3.0 3.0 4.0 2.0
## 6 6 2.5 3.0 3.5 3.5 3.0 4.0
## 20 20 4.0 4.0 5.0 4.0 4.0 2.0
## 12 12 4.0 3.0 1.0 3.0 3.0 4.0
## The.Martian
## 2 2.0
## 11 4.0
## 15 3.0
## 19 2.0
## 9 3.5
## 16 4.0
## 5 1.5
## 7 2.0
## 13 5.0
## 3 2.5
## 17 2.0
## 6 2.0
## 20 4.0
## 12 2.0
testset
## userId Toy.Story Jumanji Interstellar GoldenEye American.President Turning
## 1 1 3.5 3.0 4 3 5 4.5
## 4 4 3.0 4.0 3 4 3 NA
## 8 8 3.0 5.0 4 4 3 5.0
## 10 10 4.0 5.0 2 4 5 4.0
## 14 14 3.5 3.0 2 3 4 3.0
## 18 18 5.0 3.5 3 4 3 3.0
## The.Martian
## 1 3
## 4 5
## 8 2
## 10 3
## 14 3
## 18 2
Using your training data, calculate the raw average (mean) rating for every user-item combination.
raw_average <- (sum(trainset$Toy.Story, na.rm = TRUE) +
sum(trainset$Jumanji, na.rm = TRUE) +
sum(trainset$Interstellar, na.rm = TRUE) +
sum(trainset$GoldenEye, na.rm = TRUE) +
sum(trainset$American.President, na.rm = TRUE) +
sum(trainset$Turning, na.rm = TRUE) +
sum(trainset$The.Martian, na.rm = TRUE)
) / (sum(!is.na(trainset$Toy.Story)) +
sum(!is.na(trainset$Jumanji)) +
sum(!is.na(trainset$Interstellar)) +
sum(!is.na(trainset$GoldenEye)) +
sum(!is.na(trainset$American.President)) +
sum(!is.na(trainset$Turning)) +
sum(!is.na(trainset$The.Martian)))
raw_average
## [1] 3.35567
The raw average for the training dataset is 3.3556701.
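As a cross-check, the same raw average can be computed in one line as the mean of all non-missing ratings in the training set:
mean(as.matrix(trainset[, -1]), na.rm = TRUE) # same value, 3.3556701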
Calculate the RMSE for raw average for both your training data and your test data.
For the training data
sum = 0
count = 0
for (i in 1:nrow(trainset)) {
for (j in 2:ncol(trainset)) { # Column 1 is just userid
if (!is.na(trainset[i,j])) {
count = count +1
sum = sum + (trainset[i,j]-raw_average)^2
}
}
}
trainset_RMSE <- round(sqrt(sum/count),3)
trainset_RMSE
## [1] 1.023
The RMSE for the raw average on the training dataset is 1.023.
For the test data
sum = 0
count = 0
for (i in 1:nrow(testset)) {
for (j in 2:ncol(testset)) {
if (!is.na(testset[i,j])) {
count = count +1
sum = sum + (testset[i,j]-raw_average)^2
}
}
}
testset_RMSE <- round(sqrt(sum/count),3)
testset_RMSE
## [1] 0.908
The RMSE for the raw average on the test dataset is 0.908.
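The two loops above can also be replaced by a small vectorized helper; a sketch (the function name rmse is my own) that reproduces the 1.023 and 0.908 values:
# RMSE between observed ratings and a prediction, skipping missing entries.
# `predicted` can be a single number (the raw average) or a matrix of the same shape.
rmse <- function(observed, predicted) {
  sqrt(mean((observed - predicted)^2, na.rm = TRUE))
}
rmse(as.matrix(trainset[, -1]), raw_average) # ~1.023
rmse(as.matrix(testset[, -1]), raw_average)  # ~0.908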
Using your training data, calculate the bias for each user and each item.
Bias for users
user_bias = data.frame(userId = trainset[,1], bias = 0)
for (i in 1:nrow(user_bias)) {
user_bias[i,2] = sum(trainset[i,2:ncol(trainset)], na.rm = TRUE)/sum(!is.na(trainset[i,2:ncol(trainset)])) - raw_average
}
user_bias
## userId bias
## 1 2 -0.355670103
## 2 11 0.001472754
## 3 15 -0.212812960
## 4 19 -0.069955817
## 5 9 0.430044183
## 6 16 -0.498527246
## 7 5 -0.069955817
## 8 7 0.394329897
## 9 13 0.572901325
## 10 3 0.287187040
## 11 17 -0.141384389
## 12 6 -0.284241532
## 13 20 0.501472754
## 14 12 -0.498527246
Bias for movies
movie_bias <- data.frame(movie = as.data.frame(colnames(trainset)), bias = 0)
movie_bias <- movie_bias[-1,] # removing userid column
colnames(movie_bias) <- c('movie','bias')
for (i in 1:nrow(movie_bias)) {
movie_bias[i,2] = sum(trainset[1:nrow(trainset),i+1], na.rm = TRUE)/sum(!is.na(trainset[1:nrow(trainset),i+1])) - raw_average
}
movie_bias
## movie bias
## 2 Toy.Story 0.1443299
## 3 Jumanji -0.1413844
## 4 Interstellar 0.2157585
## 5 GoldenEye -0.2018239
## 6 American.President 0.3943299
## 7 Turning 0.1086156
## 8 The.Martian -0.5342415
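The same biases can be computed without loops using rowMeans() and colMeans(); a sketch (the _alt object names are my own):
user_bias_alt  <- rowMeans(trainset[, -1], na.rm = TRUE) - raw_average # each user's mean rating minus the raw average
movie_bias_alt <- colMeans(trainset[, -1], na.rm = TRUE) - raw_average # each movie's mean rating minus the raw average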
From the raw average, and the appropriate user and item biases, calculate the baseline predictors for every user-item combination.
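Each baseline predictor below is computed as raw_average + user bias + movie bias, with the result clipped to the 1-5 rating scale.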
predictors_trainset <- trainset
for (i in 1:nrow(predictors_trainset)) {
for (j in 2:ncol(predictors_trainset)) {
predictors_trainset[i,j] = round(raw_average + user_bias[i,2] + movie_bias[j-1,2],2)
}
}
#clip values at 5 and 1 for those greater than 5 or less than 1
for (i in 1:nrow(predictors_trainset)) {
for (j in 2:ncol(predictors_trainset)) {
if(predictors_trainset[i,j] > 5) # Upper limit 5 and lower limit 1
predictors_trainset[i,j] = 5
if(predictors_trainset[i,j] <1)
predictors_trainset[i,j] = 1
}
}
predictors_trainset
## userId Toy.Story Jumanji Interstellar GoldenEye American.President Turning
## 2 2 3.14 2.86 3.22 2.80 3.39 3.11
## 11 11 3.50 3.22 3.57 3.16 3.75 3.47
## 15 15 3.29 3.00 3.36 2.94 3.54 3.25
## 19 19 3.43 3.14 3.50 3.08 3.68 3.39
## 9 9 3.93 3.64 4.00 3.58 4.18 3.89
## 16 16 3.00 2.72 3.07 2.66 3.25 2.97
## 5 5 3.43 3.14 3.50 3.08 3.68 3.39
## 7 7 3.89 3.61 3.97 3.55 4.14 3.86
## 13 13 4.07 3.79 4.14 3.73 4.32 4.04
## 3 3 3.79 3.50 3.86 3.44 4.04 3.75
## 17 17 3.36 3.07 3.43 3.01 3.61 3.32
## 6 6 3.22 2.93 3.29 2.87 3.47 3.18
## 20 20 4.00 3.72 4.07 3.66 4.25 3.97
## 12 12 3.00 2.72 3.07 2.66 3.25 2.97
## The.Martian
## 2 2.47
## 11 2.82
## 15 2.61
## 19 2.75
## 9 3.25
## 16 2.32
## 5 2.75
## 7 3.22
## 13 3.39
## 3 3.11
## 17 2.68
## 6 2.54
## 20 3.32
## 12 2.32
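The nested loops above can also be written in vectorized form with outer(); a sketch (object names are my own) that produces the same clipped baseline matrix:
baseline <- raw_average + outer(user_bias$bias, movie_bias$bias, "+") # add every user bias to every movie bias
baseline <- pmin(pmax(baseline, 1), 5)                                # clip to the 1-5 rating scale
dimnames(baseline) <- list(user_bias$userId, colnames(trainset)[-1])  # label rows and columns
round(baseline, 2)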
Calculate the RMSE for the baseline predictors for both your training data and your test data.
For the training dataset
sum = 0
count = 0
for (i in 1:nrow(trainset)) {
for (j in 2:ncol(trainset)) {
if (!is.na(trainset[i,j])) {
count = count +1
sum = sum + (trainset[i,j]-predictors_trainset[i,j])^2
}
}
}
baseline_trainset_RMSE <- round(sqrt(sum/count),3)
User bias for the test set. Because the train/test split was done by user (rows), the test users have no ratings in the training data, so their user biases are computed from the test ratings.
user_bias_testset = data.frame(userId = testset[,1], bias = 0)
for (i in 1:nrow(user_bias_testset)) {
user_bias_testset[i,2] = sum(testset[i,2:ncol(testset)], na.rm = TRUE)/sum(!is.na(testset[i,2:ncol(testset)])) - raw_average
}
user_bias_testset
## userId bias
## 1 1 0.358615611
## 2 4 0.310996564
## 3 8 0.358615611
## 4 10 0.501472754
## 5 14 -0.284241532
## 6 18 0.001472754
Predictions for the test set
predictors_testset <- testset
for (i in 1:nrow(predictors_testset)) {
for (j in 2:ncol(predictors_testset)) {
predictors_testset[i,j] = round(raw_average + user_bias_testset[i,2] + movie_bias[j-1,2],2)
}
}
#clip values at 5 and 1 for those greater than 5 or less than 1
for (i in 1:nrow(predictors_testset)) {
for (j in 2:ncol(predictors_testset)) {
if(predictors_testset[i,j] > 5)
predictors_testset[i,j] = 5
if(predictors_testset[i,j] <1)
predictors_testset[i,j] = 1
}
}
predictors_testset
## userId Toy.Story Jumanji Interstellar GoldenEye American.President Turning
## 1 1 3.86 3.57 3.93 3.51 4.11 3.82
## 4 4 3.81 3.53 3.88 3.46 4.06 3.78
## 8 8 3.86 3.57 3.93 3.51 4.11 3.82
## 10 10 4.00 3.72 4.07 3.66 4.25 3.97
## 14 14 3.22 2.93 3.29 2.87 3.47 3.18
## 18 18 3.50 3.22 3.57 3.16 3.75 3.47
## The.Martian
## 1 3.18
## 4 3.13
## 8 3.18
## 10 3.32
## 14 2.54
## 18 2.82
RMSE calculation for the test set
sum = 0
count = 0
for (i in 1:nrow(testset)) {
for (j in 2:ncol(testset)) {
if (!is.na(testset[i,j])) {
count = count +1
sum = sum + (testset[i,j]-predictors_testset[i,j])^2
}
}
}
baseline_testset_RMSE <- round(sqrt(sum/count),3)
The RMSE of the baseline predictors is 0.915 for the training data and 0.847 for the test data.
Summarize your results.
Both the training and test RMSE improved with the addition of the user and movie biases compared to the raw average alone. The improvement was roughly 10.56% for the training set and roughly 6.72% for the test set, which is not a large gain and is probably due to the small number of movies chosen. The results could likely be improved with a larger dataset and a wider variety of movies.