This system will recommend books to readers.
Libraries:
library(kableExtra)
1. Reading in the Data.
# Download the full ratings table (comma-separated, with a header row).
get_url <- read.csv("https://raw.githubusercontent.com/ErindaB/Data-612/master/BookRatings.csv")
# Keep only the first five rows for this exercise.
Book_R <- get_url[seq_len(5), ]
kable_styling(
  kable(Book_R, caption = "Book Ratings"),
  "striped",
  full_width = TRUE
)
Book Ratings
|
Readers
|
Becoming
|
The.Splendid.and.the.Vile
|
Live.Love.Laugh
|
The.Woman.in.the.Window
|
Where.the.Crawdads.Sing
|
The.Story.of.the.Deadliest.Pandemic.in.History
|
Harry_Potter
|
Song_of_Ice_and_Fire
|
Hunger_Games
|
|
U001
|
5
|
4
|
5
|
3
|
NA
|
4
|
NA
|
NA
|
3
|
|
U002
|
4
|
5
|
5
|
4
|
5
|
3
|
NA
|
4
|
3
|
|
U003
|
4
|
NA
|
5
|
NA
|
2
|
5
|
2
|
NA
|
3
|
|
U004
|
NA
|
5
|
NA
|
4
|
2
|
5
|
3
|
1
|
3
|
|
U005
|
4
|
5
|
4
|
4
|
3
|
4
|
NA
|
5
|
3
|
2. Creating a User Item Matrix and a blank dataframe for testing.
# Ratings for the selected readers; column 1 is the reader ID and every other
# column is one book.
DataMatrix <- Book_R
# The training set starts as a full copy of the ratings.
DataTrain <- DataMatrix
# The test set starts with the same readers and book columns but no ratings,
# so held-out values land in labelled cells. (A zero-column frame would pick
# up placeholder V* column names and NA row names once it is filled.)
DataTest <- DataMatrix
DataTest[, -1] <- NA
3. Splitting the data into Test and Train matrices.
The Train matrix was created by randomly selecting one reader (row) in each book column and replacing that rating with “NA”; the Test data set was created by placing the removed ratings into a new matrix:
set.seed(3)
# One randomly chosen row per column of DataMatrix. The last draw is never
# used (the loop below only needs one draw per rating column), but the sample
# size is kept so the random stream, and therefore the split, is unchanged.
n <- sample(seq_len(nrow(DataMatrix)), ncol(DataMatrix), replace = TRUE)
# Offsets of the rating columns relative to column 1 (the reader ID).
# Written with seq_len() because `1:ncol(DataMatrix)-1` parses as
# `(1:ncol(DataMatrix)) - 1` and therefore starts at 0.
x <- seq_len(ncol(DataMatrix) - 1)
# The loop writes into every rating column of the test frame, so the frame
# must already have them; start from an all-NA copy if it does not.
if (ncol(DataTest) != ncol(DataMatrix)) {
  DataTest <- DataMatrix
  DataTest[, -1] <- NA
}
# NOTE: a draw can land on a cell that is already NA in DataMatrix, in which
# case nothing is actually held out for that book.
for (i in x) {
  held_row <- n[i]
  rating_col <- i + 1
  # Copy the rating into the test set, then hide it from the training set.
  DataTest[held_row, rating_col] <- DataMatrix[held_row, rating_col]
  DataTrain[held_row, rating_col] <- NA
}
# Drop the reader ID column so only the ratings remain.
rating_cols <- seq_len(ncol(DataMatrix))[-1]
DataTest <- DataTest[, rating_cols, drop = FALSE]
DataTrain <- DataTrain[, rating_cols, drop = FALSE]
DataTrain %>% kable(caption = "Train Data Set") %>% kable_styling("striped", full_width = TRUE)
Train Data Set
|
Becoming
|
The.Splendid.and.the.Vile
|
Live.Love.Laugh
|
The.Woman.in.the.Window
|
Where.the.Crawdads.Sing
|
The.Story.of.the.Deadliest.Pandemic.in.History
|
Harry_Potter
|
Song_of_Ice_and_Fire
|
Hunger_Games
|
|
5
|
4
|
5
|
3
|
NA
|
4
|
NA
|
NA
|
3
|
|
4
|
NA
|
5
|
4
|
NA
|
3
|
NA
|
NA
|
3
|
|
4
|
NA
|
5
|
NA
|
2
|
NA
|
2
|
NA
|
3
|
|
NA
|
5
|
NA
|
NA
|
2
|
5
|
NA
|
1
|
3
|
|
NA
|
5
|
4
|
4
|
3
|
4
|
NA
|
5
|
NA
|
# Show the held-out ratings.
kable_styling(kable(DataTest, caption = "Test Data Set"), "striped", full_width = TRUE)
Test Data Set
|
|
V2
|
V3
|
V4
|
V5
|
V6
|
V7
|
V8
|
V9
|
V10
|
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
NA.1
|
NA
|
5
|
NA
|
NA
|
5
|
NA
|
NA
|
4
|
NA
|
|
NA.2
|
NA
|
NA
|
NA
|
NA
|
NA
|
5
|
NA
|
NA
|
NA
|
|
NA.3
|
NA
|
NA
|
NA
|
4
|
NA
|
NA
|
3
|
NA
|
NA
|
|
NA.4
|
4
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
3
|
4. Calculating the raw average (mean) rating for every user-item combination in my Train data.
# Convert both rating frames to numeric matrices (one column per book).
# vapply() fixes the shape of each converted column up front, unlike sapply(),
# whose return type depends on its input.
DataTrain <- vapply(DataTrain, as.numeric, numeric(nrow(DataTrain)))
DataTest <- vapply(DataTest, as.numeric, numeric(nrow(DataTest)))
# Raw average: the mean of every observed (non-NA) training rating.
RawAverage <- mean(DataTrain, na.rm = TRUE)
# RMSE on the training set when every rating is predicted as the raw average
errortrain <- RawAverage - DataTrain
RMSETrain <- sqrt(mean(errortrain^2, na.rm = TRUE))
round(RMSETrain, 2)
## [1] 1.12
# RMSE on the test set, again predicting the raw average for every rating
errortest <- RawAverage - DataTest
mse_test <- mean(errortest^2, na.rm = TRUE)
RMSETest <- sqrt(mse_test)
round(RMSETest, digits = 2)
## [1] 0.89
5. Using training data, let’s calculate the bias for each user and each item.
# User bias: each reader's mean training rating minus the raw average.
UserBias <- round(rowMeans(DataTrain, na.rm = TRUE) - RawAverage, digits = 3)
# Pair the first column of Book_R (the reader IDs) with the biases.
y <- cbind(Book_R[1], UserBias)
kable_styling(
  kable(y, caption = "User Bias Calculations"),
  "striped",
  full_width = TRUE
)
User Bias Calculations
|
Readers
|
UserBias
|
|
U001
|
0.296
|
|
U002
|
0.096
|
|
U003
|
-0.504
|
|
U004
|
-0.504
|
|
U005
|
0.463
|
# Book bias: each book's mean training rating minus the raw average.
BookBias <- round(colMeans(DataTrain, na.rm = TRUE) - RawAverage, digits = 3)
kable_styling(
  kable(BookBias, caption = "Book Bias Calculations"),
  "striped",
  full_width = TRUE
)
Book Bias Calculations
|
|
x
|
|
Becoming
|
0.630
|
|
The.Splendid.and.the.Vile
|
0.963
|
|
Live.Love.Laugh
|
1.046
|
|
The.Woman.in.the.Window
|
-0.037
|
|
Where.the.Crawdads.Sing
|
-1.370
|
|
The.Story.of.the.Deadliest.Pandemic.in.History
|
0.296
|
|
Harry_Potter
|
-1.704
|
|
Song_of_Ice_and_Fire
|
-0.704
|
|
Hunger_Games
|
-0.704
|
6. Calculating the baseline predictors for every user-item combination.
# Baseline predictor for reader u and book b:
#   RawAverage + UserBias[u] + BookBias[b]
# outer() builds the full reader-by-book grid of bias sums (rows = readers,
# columns = books), so nothing here depends on how many readers or books
# there are.
BaseLinePred <- round(outer(UserBias, BookBias, "+") + RawAverage, 2)
# Human-readable column names, listed in the same order as the book columns
BookNames <- c("Becoming", "The Splendid and the Vile", "Live Love Laugh", "The Woman in the Window","Where the Crawdads Sing", "The Story of the Deadliest Pandemic in History", "Harry Potter", "Song of Ice and Fire", "Hunger Games")
colnames(BaseLinePred) <- BookNames
BaseLinePred %>% kable(caption = "Baseline Predictor Calculations") %>% kable_styling("striped", full_width = TRUE)
Baseline Predictor Calculations
|
Becoming
|
The Splendid and the Vile
|
Live Love Laugh
|
The Woman in the Window
|
Where the Crawdads Sing
|
The Story of the Deadliest Pandemic in History
|
Harry Potter
|
Song of Ice and Fire
|
Hunger Games
|
|
4.63
|
4.96
|
5.05
|
3.96
|
2.63
|
4.30
|
2.30
|
3.30
|
3.30
|
|
4.43
|
4.76
|
4.85
|
3.76
|
2.43
|
4.10
|
2.10
|
3.10
|
3.10
|
|
3.83
|
4.16
|
4.25
|
3.16
|
1.83
|
3.50
|
1.50
|
2.50
|
2.50
|
|
3.83
|
4.16
|
4.25
|
3.16
|
1.83
|
3.50
|
1.50
|
2.50
|
2.50
|
|
4.80
|
5.13
|
5.21
|
4.13
|
2.80
|
4.46
|
2.46
|
3.46
|
3.46
|
7. Calculating the RMSE for the baseline predictors for both training data and test data.
# Training-set RMSE of the baseline predictor
errortrain <- BaseLinePred - DataTrain
mse_train <- mean(errortrain^2, na.rm = TRUE)
RMSETrain <- sqrt(mse_train)
round(RMSETrain, digits = 2)
## [1] 0.73
# Test-set RMSE of the baseline predictor
errortest <- BaseLinePred - DataTest
mse_test <- mean(errortest^2, na.rm = TRUE)
RMSETest <- sqrt(mse_test)
round(RMSETest, digits = 2)
## [1] 1.3
Summary
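As we can see, using the baseline predictors improved the RMSE on the training set (1.12 → 0.73) but worsened it on the test set (0.89 → 1.3). This is likely due to the very small data set: with so few ratings, the user and book biases fit the training ratings closely but do not generalize to the held-out ones.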
Predictions
# Label each row of predictions with its reader ID.
Users <- c("U001", "U002", "U003", "U004", "U005")
# A data frame keeps the predictions numeric; cbind() on a character vector
# and a numeric matrix would turn every prediction into a string.
# check.names = FALSE preserves the spaces in the book titles.
BaseLinePred <- data.frame(Users = Users, BaseLinePred, check.names = FALSE)
BaseLinePred %>% kable(caption = "Predictions") %>% kable_styling("striped", full_width = TRUE)
Predictions
|
Users
|
Becoming
|
The Splendid and the Vile
|
Live Love Laugh
|
The Woman in the Window
|
Where the Crawdads Sing
|
The Story of the Deadliest Pandemic in History
|
Harry Potter
|
Song of Ice and Fire
|
Hunger Games
|
|
U001
|
4.63
|
4.96
|
5.05
|
3.96
|
2.63
|
4.3
|
2.3
|
3.3
|
3.3
|
|
U002
|
4.43
|
4.76
|
4.85
|
3.76
|
2.43
|
4.1
|
2.1
|
3.1
|
3.1
|
|
U003
|
3.83
|
4.16
|
4.25
|
3.16
|
1.83
|
3.5
|
1.5
|
2.5
|
2.5
|
|
U004
|
3.83
|
4.16
|
4.25
|
3.16
|
1.83
|
3.5
|
1.5
|
2.5
|
2.5
|
|
U005
|
4.8
|
5.13
|
5.21
|
4.13
|
2.8
|
4.46
|
2.46
|
3.46
|
3.46
|