In this first assignment, I’ll attempt to predict ratings with very little information. I’ll first use the raw average of all ratings in the training dataset as the prediction, and then account for “bias” by normalizing across users and across items.

Since I am predicting ratings with very little information, I will make my own toy dataset using the sample() function from base R. The toy dataset is a collection of audiobook ratings by a handful of users; the system will recommend audiobooks to listeners based on their past ratings.

# load libraries
library(tidyverse)
library(kableExtra)
library(knitr)

I use the sample() function to generate 36 ratings of 6 audiobooks by 6 users and store them in a matrix. I then split the ratings into separate training and test datasets, create some missing values, add user names and audiobook titles, and print the user-item matrices. Each rating is therefore either assigned to the training dataset, held out for the test dataset, or missing.
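As a side note, the split below works by cell position: indexing a matrix with a single vector uses linear, column-major positions. The 2-by-3 matrix here is only a throwaway illustration and not part of the toy dataset.

# illustrative only: linear (column-major) indexing of a matrix
m <- matrix(1:6, nrow = 2)
m[c(1, 4)] <- NA  # blanks out position 1 (row 1, col 1) and position 4 (row 2, col 2)
m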

# random sample of 36 ratings
set.seed(12)
df <- matrix(sample(1:5, 36, replace = TRUE), nrow = 6)

# randomly choose 6 ratings to hold out for the test set
split_df <- sample(1:length(df), 6, replace = FALSE)

# split the data into the training dataset
train <- df
train[split_df] <- NA

# split the data into the test dataset
test <- df
test[-split_df] <- NA

# create some missing values in both datasets
set.seed(5)
missing_df <- sample(1:length(df), 6, replace = FALSE)
df[missing_df] <- NA
train[missing_df] <- NA
test[missing_df] <- NA

# names of the users
users <- c("Saayed Alam", "Bill Gates", "Humera Ferdous", "Kevin Rose", "Cody Rodes", "Jon Moxley")
rownames(df) <- users
rownames(train) <- users
rownames(test) <- users

# names of the audiobooks
audiobook <- c("Caesar's Last Breath", "Why Buddhism is True", "The Power of Now", "Leonardo da Vinci", "Factfulness", "The Order of Time")
colnames(df) <- audiobook
colnames(train) <- audiobook
colnames(test) <- audiobook

# print the matrix
df %>% kable(caption = "Audiobook Ratings") %>% kable_styling("striped", full_width = TRUE)
Table: Audiobook Ratings

| | Caesar's Last Breath | Why Buddhism is True | The Power of Now | Leonardo da Vinci | Factfulness | The Order of Time |
|---|---|---|---|---|---|---|
| Saayed Alam | 2 | NA | 4 | 2 | 1 | 4 |
| Bill Gates | NA | 3 | 2 | 3 | 2 | 4 |
| Humera Ferdous | 3 | 2 | NA | NA | 2 | 5 |
| Kevin Rose | 5 | 5 | 5 | 4 | 4 | 5 |
| Cody Rodes | 5 | NA | 4 | 5 | 4 | 3 |
| Jon Moxley | 4 | 1 | 1 | 5 | NA | 5 |
train %>% kable(caption = "Training Dataset") %>% kable_styling("striped", full_width = TRUE)
Table: Training Dataset

| | Caesar's Last Breath | Why Buddhism is True | The Power of Now | Leonardo da Vinci | Factfulness | The Order of Time |
|---|---|---|---|---|---|---|
| Saayed Alam | 2 | NA | NA | 2 | 1 | 4 |
| Bill Gates | NA | 3 | 2 | NA | 2 | 4 |
| Humera Ferdous | 3 | 2 | NA | NA | NA | 5 |
| Kevin Rose | 5 | 5 | 5 | 4 | 4 | NA |
| Cody Rodes | 5 | NA | 4 | NA | 4 | 3 |
| Jon Moxley | 4 | 1 | 1 | 5 | NA | 5 |
test %>% kable(caption = "Test Dataset") %>% kable_styling("striped", full_width = TRUE)
Table: Test Dataset

| | Caesar's Last Breath | Why Buddhism is True | The Power of Now | Leonardo da Vinci | Factfulness | The Order of Time |
|---|---|---|---|---|---|---|
| Saayed Alam | NA | NA | 4 | NA | NA | NA |
| Bill Gates | NA | NA | NA | 3 | NA | NA |
| Humera Ferdous | NA | NA | NA | NA | 2 | NA |
| Kevin Rose | NA | NA | NA | NA | NA | 5 |
| Cody Rodes | NA | NA | NA | 5 | NA | NA |
| Jon Moxley | NA | NA | NA | NA | NA | NA |
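As a quick sanity check (not part of the original assignment steps), no rating should appear in both the training and the test matrix, and together they should account for every observed rating in df:

# sanity check: the split is disjoint and covers all observed ratings
stopifnot(!any(!is.na(train) & !is.na(test)))
stopifnot(sum(!is.na(train)) + sum(!is.na(test)) == sum(!is.na(df)))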

Then, using the training dataset, I calculate the raw average (mean) rating, use it as the prediction for every user-item combination, and calculate the RMSE of this raw-average predictor on both the training data and the test data.
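Throughout, RMSE is computed over the set of known (non-missing) ratings only:

$$\mathrm{RMSE} = \sqrt{\frac{1}{|K|} \sum_{(u,i) \in K} \left( r_{u,i} - \hat{r}_{u,i} \right)^2}$$

where $K$ is the set of user-item pairs with an observed rating, $r_{u,i}$ is the observed rating, and $\hat{r}_{u,i}$ is the predicted rating (here, the raw average for every pair).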

# raw average
raw_ave <- round(mean(train, na.rm = TRUE), 2)

# user-item matrix of raw-average predictions
user_item <- matrix(replicate(36, raw_ave), 6)
rownames(user_item) <- rownames(train)
colnames(user_item) <- colnames(train)
user_item %>% kable(caption = "User-Item Matrix") %>% kable_styling("striped", full_width = TRUE)
Table: User-Item Matrix

| | Caesar's Last Breath | Why Buddhism is True | The Power of Now | Leonardo da Vinci | Factfulness | The Order of Time |
|---|---|---|---|---|---|---|
| Saayed Alam | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 |
| Bill Gates | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 |
| Humera Ferdous | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 |
| Kevin Rose | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 |
| Cody Rodes | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 |
| Jon Moxley | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 | 3.4 |
# function to calculate RMSE between observed (o) and predicted (p) ratings, ignoring NAs
rmse <- function(o, p) {
  round((sqrt(mean((o - p)^2, na.rm = TRUE))), 2)
}

# rmse for train dataset
rmse1 <- rmse(train, raw_ave)

# rmse for test dataset
rmse2 <- rmse(test, raw_ave)

Next, I calculate the bias for each user and each item using the training data. Then, from the raw average and the appropriate user and item biases, I calculate the baseline predictor for every user-item combination.
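Written out, with $\mu$ denoting the raw average of the training ratings, the biases and baseline predictors computed below are

$$b_u = \bar{r}_u - \mu, \qquad b_i = \bar{r}_i - \mu, \qquad \hat{r}_{u,i} = \mu + b_u + b_i$$

where $\bar{r}_u$ is user $u$'s mean training rating and $\bar{r}_i$ is item $i$'s mean training rating.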

# bias for each user
user_bias <- round((rowMeans(train, na.rm = TRUE) - raw_ave), 2)
user_bias %>% kable(col.names = "User Bias") %>% kable_styling("striped", full_width = TRUE)
| | User Bias |
|---|---|
| Saayed Alam | -1.15 |
| Bill Gates | -0.65 |
| Humera Ferdous | -0.07 |
| Kevin Rose | 1.20 |
| Cody Rodes | 0.60 |
| Jon Moxley | -0.20 |
# bias for each item
item_bias <- round((colMeans(train, na.rm = TRUE) - raw_ave), 2)
item_bias %>% kable(col.names = "Item Bias") %>% kable_styling("striped", full_width = TRUE)
| | Item Bias |
|---|---|
| Caesar's Last Breath | 0.40 |
| Why Buddhism is True | -0.65 |
| The Power of Now | -0.40 |
| Leonardo da Vinci | 0.27 |
| Factfulness | -0.65 |
| The Order of Time | 0.80 |
# sum of the user bias and the item bias for every user-item combination
bias_sum <- outer(user_bias, item_bias, "+")

# baseline predictors: raw average + user bias + item bias
baseline <- raw_ave + bias_sum
rownames(baseline) <- rownames(train)
colnames(baseline) <- colnames(train)
baseline %>% kable(caption = "Baseline Predictors") %>% kable_styling("striped", full_width = TRUE)
Table: Baseline Predictors

| | Caesar's Last Breath | Why Buddhism is True | The Power of Now | Leonardo da Vinci | Factfulness | The Order of Time |
|---|---|---|---|---|---|---|
| Saayed Alam | 2.65 | 1.60 | 1.85 | 2.52 | 1.60 | 3.05 |
| Bill Gates | 3.15 | 2.10 | 2.35 | 3.02 | 2.10 | 3.55 |
| Humera Ferdous | 3.73 | 2.68 | 2.93 | 3.60 | 2.68 | 4.13 |
| Kevin Rose | 5.00 | 3.95 | 4.20 | 4.87 | 3.95 | 5.40 |
| Cody Rodes | 4.40 | 3.35 | 3.60 | 4.27 | 3.35 | 4.80 |
| Jon Moxley | 3.60 | 2.55 | 2.80 | 3.47 | 2.55 | 4.00 |
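A few of these baseline predictors fall outside the 1-5 rating scale (for example, 5.40 for Kevin Rose and The Order of Time). A common refinement, which I do not apply in the RMSE calculations below, is to clip the predictors to the valid range:

# optional refinement (not used below): clip baseline predictors to the 1-5 scale
baseline_clipped <- pmin(pmax(baseline, 1), 5)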

Finally, I calculate the RMSE of the baseline predictors on both the training data and the test data, and summarize the results in the table below.

# rmse for baseline predictors on the training dataset
rmse3 <- rmse(train, baseline)

# rmse for baseline predictors on the test dataset
rmse4 <- rmse(test, baseline)

# summary of the results
kable(cbind(rmse1, rmse2, rmse3, rmse4), col.names = rep(c("Train", "Test"), 2)) %>% 
  kable_styling("striped", full_width = FALSE) %>% 
  add_header_above(c("Raw Average" = 2, "Baseline Predictor" = 2))
| Raw Average (Train) | Raw Average (Test) | Baseline Predictor (Train) | Baseline Predictor (Test) |
|---|---|---|---|
| 1.39 | 1.23 | 0.91 | 1.08 |