Project 1

Load Data

Load the the ratings and movie title datasets

ratings_import <- read.csv("https://raw.githubusercontent.com/dhairavc/DATA612-RecommenderSystems/master/Project1/ratings.csv")[,1:3]
movie_titles <- read.csv("https://raw.githubusercontent.com/dhairavc/DATA612-RecommenderSystems/master/Project1/movies.csv")[,1:2]

Crease Dense Subset

For educational purposes create a smaller dense dataset of 5 randomly selected movies with the most reviews and 12 random users with a max 2 movies not rated per user

# Create a subset of random 5 movies which have grather than 150 reviews
m_filter <- ratings_import %>% select(movieId) %>% group_by(movieId) %>% summarise("RatingCount" = n()) %>% arrange(desc(RatingCount)) %>% filter(RatingCount > 150)
m_filter <- m_filter[sample(nrow(m_filter), 5),]

# Create subset of 12 random users which have rated atleast 3 of the 5 movies selected 
u_filter <- ratings_import %>% filter(movieId %in% m_filter$movieId) %>% select(userId) %>% group_by(userId) %>% summarise("Rated" = n()) %>% arrange(desc(Rated)) %>% filter(Rated %in% c(3,4,5))
rating_sample <- ratings_import %>% filter(movieId %in% m_filter$movieId) %>% filter(userId %in% sample(u_filter$userId,12))

# Join movie titles and remove movie Ids
rating_sample <- left_join(rating_sample, movie_titles, by = "movieId")
rating_sample <- rating_sample %>% select(-movieId) 


ratings <- spread(rating_sample, title, rating)
row.names(ratings) <- c(ratings$userId)
ratings <- ratings[,2:6]
ratings %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

	Men in Black (a.k.a. MIB) (1997)	Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)	Saving Private Ryan (1998)	Schindler’s List (1993)	Star Wars: Episode V - The Empire Strikes Back (1980)
17	NA	4.5	4.5	4.5	5.0
18	3.5	4.0	4.0	4.5	4.0
45	3.0	4.0	5.0	NA	5.0
195	NA	NA	3.0	4.0	4.0
199	4.0	4.0	4.0	5.0	3.5
266	3.0	5.0	4.0	NA	5.0
274	3.0	4.0	4.0	4.0	4.5
282	4.0	NA	4.5	5.0	NA
354	3.5	NA	3.5	4.0	4.0
494	NA	5.0	5.0	NA	5.0
570	4.0	4.0	4.0	4.5	4.0
600	NA	4.0	NA	3.0	4.0

Split Datasets

Split the existing dataset into training and testing datasets

n <- nrow(ratings)
p_train <- .6
rowsintrain <- floor(n*p_train)
randomize_order <- ratings[sample(n),]
train_ratings <- randomize_order[1:rowsintrain,]
test_ratings <- randomize_order[(rowsintrain+1):n,]

#training dataset
train_ratings %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

	Men in Black (a.k.a. MIB) (1997)	Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)	Saving Private Ryan (1998)	Schindler’s List (1993)	Star Wars: Episode V - The Empire Strikes Back (1980)
18	3.5	4.0	4.0	4.5	4.0
17	NA	4.5	4.5	4.5	5.0
274	3.0	4.0	4.0	4.0	4.5
354	3.5	NA	3.5	4.0	4.0
600	NA	4.0	NA	3.0	4.0
195	NA	NA	3.0	4.0	4.0
570	4.0	4.0	4.0	4.5	4.0

#testing dataset
test_ratings %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

	Men in Black (a.k.a. MIB) (1997)	Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)	Saving Private Ryan (1998)	Schindler’s List (1993)	Star Wars: Episode V - The Empire Strikes Back (1980)
282	4	NA	4.5	5	NA
199	4	4	4.0	5	3.5
266	3	5	4.0	NA	5.0
45	3	4	5.0	NA	5.0
494	NA	5	5.0	NA	5.0

Raw Average

Compute the raw average mean

raw_average <- mean(as.matrix(train_ratings), na.rm = TRUE)
raw_average

## [1] 3.982759

RMSE Function

Create function to calculate the Root Mean Squared Error (RMSE)

RMSE_calc <- function(df, q) {
  sqrt(mean(as.matrix(((df-q)^2)),na.rm = TRUE))
}

RMSE Calculation

Calculate the RMSE RMSE for the raw average for both testing and training datasets

train_RMSE <- RMSE_calc(train_ratings, raw_average)
test_RMSE <- RMSE_calc(test_ratings, raw_average)

train_RMSE

## [1] 0.4639181

test_RMSE

## [1] 0.786298

Baseline Prediction for Training Dataset

Calculate user and movie bias and calculate the baseline predictions for the training dataset

user_bias <- rowMeans(train_ratings, na.rm = TRUE) - raw_average
movie_bias <- colMeans(train_ratings, na.rm = TRUE) - raw_average

train_ratings_pred <- data.frame(matrix(rep(raw_average, nrow(train_ratings)*ncol(train_ratings)), nrow = nrow(train_ratings)))

colnames(train_ratings_pred) <- colnames(train_ratings)
rownames(train_ratings_pred) <- rownames(train_ratings)
train_ratings_pred <-round(sweep(train_ratings_pred,MARGIN = 2, STATS = movie_bias, FUN = "+")+user_bias,1)

#clip [.5, 5]
train_ratings_pred[which(train_ratings_pred < .5, arr.ind = TRUE)] <- .5
train_ratings_pred[which(train_ratings_pred > 5, arr.ind = TRUE)] <- 5

train_ratings_pred %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

	Men in Black (a.k.a. MIB) (1997)	Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)	Saving Private Ryan (1998)	Schindler’s List (1993)	Star Wars: Episode V - The Empire Strikes Back (1980)
18	3.5	4.1	3.9	4.1	4.2
17	4.1	4.7	4.5	4.7	4.9
274	3.4	4.0	3.8	4.0	4.1
354	3.3	3.9	3.6	3.8	4.0
600	3.2	3.8	3.5	3.8	3.9
195	3.2	3.8	3.5	3.8	3.9
570	3.6	4.2	4.0	4.2	4.3

Baseline Prediction for Test Dataset

Calculate user bias and calculate the baseline predictions for the training dataset using the movie bias from the training dataset

user_bias_test <- rowMeans(test_ratings, na.rm = TRUE)-raw_average
test_ratings_pred <- data.frame(matrix(rep(raw_average, nrow(test_ratings)*ncol(test_ratings)), nrow = nrow(test_ratings)))
colnames(test_ratings_pred) <- colnames(test_ratings)
rownames(test_ratings_pred) <- rownames(test_ratings)
test_ratings_pred <-round(sweep(test_ratings_pred,MARGIN = 2, STATS = movie_bias, FUN = "+")+user_bias_test,1)

#clip [.5, 5]
test_ratings_pred[which(test_ratings_pred < .5, arr.ind = TRUE)] <- .5
test_ratings_pred[which(test_ratings_pred > 5, arr.ind = TRUE)] <- 5

test_ratings_pred %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

	Men in Black (a.k.a. MIB) (1997)	Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)	Saving Private Ryan (1998)	Schindler’s List (1993)	Star Wars: Episode V - The Empire Strikes Back (1980)
282	4.0	4.6	4.4	4.6	4.7
199	3.6	4.2	4.0	4.2	4.3
266	3.8	4.4	4.1	4.3	4.5
45	3.8	4.4	4.1	4.3	4.5
494	4.5	5.0	4.9	5.0	5.0

RMSE Calculation for Baseline Predictor

Calculate the RMSE for both predicted datasets

train_RMSE_baseline <- RMSE_calc(train_ratings, train_ratings_pred)
test_RMSE_baseline <- RMSE_calc(test_ratings, test_ratings_pred)

train_RMSE_baseline

## [1] 0.2710134

test_RMSE_baseline

## [1] 0.5015765

Conclusion

Summarize findings and calculate improvement over RMSE base just on the raw average

raw_avg <- c(raw_average, raw_average)
RMSE <- c(train_RMSE, test_RMSE)
RMSE_b <- c(train_RMSE_baseline, test_RMSE_baseline)
imp <- percent(round(1-(RMSE_b/RMSE),5))

sum_df <- data.frame(raw_avg, RMSE, RMSE_b, imp)
colnames(sum_df) <- c("Raw Average", "RMSE", "RMSE Baseline", "Improvement")
rownames(sum_df) <- c("Train Ratings", "Test Ratings")

sum_df %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

	Raw Average	RMSE	RMSE Baseline	Improvement
Train Ratings	3.982759	0.4639181	0.2710134	41.58%
Test Ratings	3.982759	0.7862980	0.5015765	36.21%