This system will recommend Books to readers.

Libraries:

library(kableExtra)

1. Reading in the Data.

# Download the book-ratings CSV (header row, comma-separated values).
ratings_url <- "https://raw.githubusercontent.com/ErindaB/Data-612/master/BookRatings.csv"
get_url <- read.csv(file = ratings_url, header = TRUE, sep = ",")
# Keep only the first five rows of the download for this exercise.
Book_R <- get_url[seq_len(5), ]
# Show the selected ratings as a striped, full-width table.
Book_R %>%
  kable(caption = "Book Ratings") %>%
  kable_styling("striped", full_width = TRUE)
Book Ratings
Readers Becoming The.Splendid.and.the.Vile Live.Love.Laugh The.Woman.in.the.Window Where.the.Crawdads.Sing The.Story.of.the.Deadliest.Pandemic.in.History Harry_Potter Song_of_Ice_and_Fire Hunger_Games
U001 5 4 5 3 NA 4 NA NA 3
U002 4 5 5 4 5 3 NA 4 3
U003 4 NA 5 NA 2 5 2 NA 3
U004 NA 5 NA 4 2 5 3 1 3
U005 4 5 4 4 3 4 NA 5 3

2. Creating a User Item Matrix and a blank dataframe for testing.

# Work on a copy of the sampled ratings so Book_R itself stays untouched.
DataMatrix <- Book_R
# The training set starts as the full matrix; the split step below blanks
# the held-out cells.
DataTrain <- DataMatrix
# The test set starts with the same shape and column names as DataMatrix,
# with every column after the first set to NA. Building it this way
# (instead of indexing rows 6:10 of an empty data.frame) keeps the book names
# as column names and avoids the placeholder row names NA, NA.1, ...
DataTest <- DataMatrix
DataTest[, -1] <- NA

3. Splitting the data into Test and Train matrices.

The Train matrix was created by randomly selecting one reader (row) for each book (column) and replacing that rating with NA. The Test data set was created by placing those removed ratings into a new matrix, leaving every other cell NA:

# Fix the RNG seed so the same cells are held out on every run.
set.seed(3)
# Every column after the first is a rating column; column 1 is skipped.
book_cols <- seq_len(ncol(DataMatrix))[-1]
# Draw one random row index per column of DataMatrix (with replacement).
# Only the first length(book_cols) draws are used below.
n <- sample(seq_len(nrow(DataMatrix)), ncol(DataMatrix), replace = TRUE)
# Start the test set as an all-NA frame with DataMatrix's shape and names, so
# the loop only ever assigns into cells that already exist.
DataTest <- DataMatrix
DataTest[, book_cols] <- NA
# For each rating column, move the rating of the drawn row from train to test.
# The original iterated over `1:ncol(DataMatrix)-1`, which R parses as
# (1:ncol(DataMatrix)) - 1, i.e. 0:9, and relied on the i = 0 pass being a no-op.
for (k in seq_along(book_cols)) {
  held_row <- n[k]
  held_col <- book_cols[k]
  DataTest[held_row, held_col] <- DataMatrix[held_row, held_col]
  DataTrain[held_row, held_col] <- NA
}
# Drop the first column so only the rating columns remain.
DataTest <- DataTest[, book_cols, drop = FALSE]
DataTrain <- DataTrain[, book_cols, drop = FALSE]
DataTrain %>%
  kable(caption = "Train Data Set") %>%
  kable_styling("striped", full_width = TRUE)
Train Data Set
Becoming The.Splendid.and.the.Vile Live.Love.Laugh The.Woman.in.the.Window Where.the.Crawdads.Sing The.Story.of.the.Deadliest.Pandemic.in.History Harry_Potter Song_of_Ice_and_Fire Hunger_Games
5 4 5 3 NA 4 NA NA 3
4 NA 5 4 NA 3 NA NA 3
4 NA 5 NA 2 NA 2 NA 3
NA 5 NA NA 2 5 NA 1 3
NA 5 4 4 3 4 NA 5 NA
# Render the held-out ratings as a striped, full-width table.
DataTest %>%
  kable(caption = "Test Data Set") %>%
  kable_styling("striped", full_width = TRUE)
Test Data Set
V2 V3 V4 V5 V6 V7 V8 V9 V10
NA NA NA NA NA NA NA NA NA NA
NA.1 NA 5 NA NA 5 NA NA 4 NA
NA.2 NA NA NA NA NA 5 NA NA NA
NA.3 NA NA NA 4 NA NA 3 NA NA
NA.4 4 NA NA NA NA NA NA NA 3

4. Calculating the raw average (mean) rating for every user-item combination in my Train data.

# Convert both data frames to numeric matrices (one column per data-frame
# column). vapply() pins every column to a numeric vector of the expected
# length, whereas sapply() can silently change its return shape.
DataTrain <- vapply(DataTrain, as.numeric, numeric(nrow(DataTrain)))
DataTest <- vapply(DataTest, as.numeric, numeric(nrow(DataTest)))
# Raw average: mean of all non-NA training ratings.
RawAverage <- mean(DataTrain, na.rm = TRUE)
# RMSE of the training set when every rating is predicted by RawAverage.
errortrain <- RawAverage - DataTrain
RMSETrain <- sqrt(mean(errortrain^2, na.rm = TRUE))
round(RMSETrain, 2)
## [1] 1.12
# Test-set RMSE for the same constant prediction (RawAverage).
errortest <- RawAverage - DataTest
test_sq_err <- errortest^2
RMSETest <- sqrt(mean(test_sq_err, na.rm = TRUE))
round(RMSETest, digits = 2)
## [1] 0.89

5. Using training data, let’s calculate the bias for each user and each item.

# User bias: mean of each DataTrain row (NAs ignored) minus the raw average,
# rounded to three decimals.
user_means <- rowMeans(DataTrain, na.rm = TRUE)
UserBias <- round(user_means - RawAverage, 3)
# Bind the biases onto Book_R, then drop columns 2 through 10 so that only the
# first column and UserBias remain for display.
y <- cbind(Book_R, UserBias)[-(2:10)]
y %>%
  kable(caption = "User Bias Calculations") %>%
  kable_styling("striped", full_width = TRUE)
User Bias Calculations
Readers UserBias
U001 0.296
U002 0.096
U003 -0.504
U004 -0.504
U005 0.463
# Book bias: mean of each DataTrain column (NAs ignored) minus the raw
# average, rounded to three decimals.
book_means <- colMeans(DataTrain, na.rm = TRUE)
BookBias <- round(book_means - RawAverage, 3)
BookBias %>%
  kable(caption = "Book Bias Calculations") %>%
  kable_styling("striped", full_width = TRUE)
Book Bias Calculations
x
Becoming 0.630
The.Splendid.and.the.Vile 0.963
Live.Love.Laugh 1.046
The.Woman.in.the.Window -0.037
Where.the.Crawdads.Sing -1.370
The.Story.of.the.Deadliest.Pandemic.in.History 0.296
Harry_Potter -1.704
Song_of_Ice_and_Fire -0.704
Hunger_Games -0.704

6. Calculating the baseline predictors for every user-item combination.

# Baseline predictor for every user/book pair:
#   raw average + user bias + book bias
n_users <- length(UserBias)
n_books <- length(BookBias)
# Book bias repeated down the rows: one identical row per user.
y <- matrix(BookBias, nrow = n_users, ncol = n_books, byrow = TRUE,
            dimnames = list(NULL, names(BookBias)))
# User bias repeated across the columns: one identical column per book.
z <- matrix(UserBias, nrow = n_users, ncol = n_books)
# Sum both bias matrices with the raw average, rounded to two decimals.
BaseLinePred <- round(z + y + RawAverage, 2)
# Replace the column names with display-friendly book titles.
BookNames <- c("Becoming", "The Splendid and the Vile", "Live Love Laugh", "The Woman in the Window","Where the Crawdads Sing", "The Story of the Deadliest Pandemic in History", "Harry Potter",  "Song of Ice and Fire", "Hunger Games")
colnames(BaseLinePred) <- BookNames
BaseLinePred %>%
  kable(caption = "Baseline Predictor Calculations") %>%
  kable_styling("striped", full_width = TRUE)
Baseline Predictor Calculations
Becoming The Splendid and the Vile Live Love Laugh The Woman in the Window Where the Crawdads Sing The Story of the Deadliest Pandemic in History Harry Potter Song of Ice and Fire Hunger Games
4.63 4.96 5.05 3.96 2.63 4.30 2.30 3.30 3.30
4.43 4.76 4.85 3.76 2.43 4.10 2.10 3.10 3.10
3.83 4.16 4.25 3.16 1.83 3.50 1.50 2.50 2.50
3.83 4.16 4.25 3.16 1.83 3.50 1.50 2.50 2.50
4.80 5.13 5.21 4.13 2.80 4.46 2.46 3.46 3.46

7. Calculating the RMSE for the baseline predictors for both training data and test data.

# Training-set RMSE of the baseline predictions; NA cells are ignored.
errortrain <- BaseLinePred - DataTrain
train_sq_err <- errortrain^2
RMSETrain <- sqrt(mean(train_sq_err, na.rm = TRUE))
round(RMSETrain, digits = 2)
## [1] 0.73
# Test-set RMSE of the baseline predictions; NA cells are ignored.
errortest <- BaseLinePred - DataTest
test_sq_err <- errortest^2
RMSETest <- sqrt(mean(test_sq_err, na.rm = TRUE))
round(RMSETest, digits = 2)
## [1] 1.3

Summary

As we can see, the RMSE improved with the baseline predictors on the training set (1.12 → 0.73), but it worsened on the test set (0.89 → 1.3). This is likely due to the very small data set (five readers, nine books), which gives the bias estimates very limited information.

Predictions

# Attach the user IDs to the predictions for display.
# A data frame is used instead of cbind() on the matrix: cbind() with a
# character vector coerces every prediction to character, so the values stop
# being numeric and trailing zeros are dropped (e.g. 4.30 prints as "4.3").
# check.names = FALSE keeps the column names exactly as set on the matrix.
Users <- c("U001", "U002", "U003", "U004", "U005")
BaseLinePred <- data.frame(Users, BaseLinePred,
                           check.names = FALSE, stringsAsFactors = FALSE)
BaseLinePred %>%
  kable(caption = "Predictions") %>%
  kable_styling("striped", full_width = TRUE)
Predictions
Users Becoming The Splendid and the Vile Live Love Laugh The Woman in the Window Where the Crawdads Sing The Story of the Deadliest Pandemic in History Harry Potter Song of Ice and Fire Hunger Games
U001 4.63 4.96 5.05 3.96 2.63 4.3 2.3 3.3 3.3
U002 4.43 4.76 4.85 3.76 2.43 4.1 2.1 3.1 3.1
U003 3.83 4.16 4.25 3.16 1.83 3.5 1.5 2.5 2.5
U004 3.83 4.16 4.25 3.16 1.83 3.5 1.5 2.5 2.5
U005 4.8 5.13 5.21 4.13 2.8 4.46 2.46 3.46 3.46