DATA 612 Project 1 | Global Baseline Predictors and RMSE

Description
Dataset
Load Data
Training and Test datasets
Calculations
Summarizing results

Description

User-Book Rating Recommender System

This is a recommender system with 20 users and their respective ratings on 5 books. It recommends books to users based on other user ratings.

Dataset

Step-1:

I made a toy dataset, which has 20 users as rows and 5 Books as columns with numeric values of ratings ranging from 1 to 5

Load Data

Step-2:

Load the necessary libraries

library(pander)
library(ggplot2)
library(knitr)
library(dplyr)
library(reshape2)

Step-3:

Load the User_Books dataset and create a user-item matrix

# Read the ratings CSV (its first column supplies the row names) and coerce
# the resulting data frame straight into a user x book matrix.
data <- as.matrix(
    read.csv(
        file = "https://raw.githubusercontent.com/PriyaShaji/Data612/master/Project_1/User_Books.csv",
        row.names = 1
    )
)

# Render the full ratings matrix as a table.
pander(data)

	book1	book2	book3	book4	book5
A	3	NA	4	4	3
B	4	5	3	NA	2
C	NA	3	3	3	3
D	5	5	5	5	NA
E	2	3	3	NA	4
F	3	4	4	3	3
G	2	2	NA	2	2
H	3	NA	4	4	4
I	2	3	3	NA	4
J	3	NA	3	3	3
K	1	1	1	NA	1
L	NA	3	4	3	4
M	3	NA	4	4	3
N	4	4	3	NA	2
O	NA	3	4	4	4
P	5	5	NA	5	5
Q	2	NA	3	3	4
R	3	NA	3	3	3
S	2	2	2	NA	2
T	3	NA	4	4	4

Training and Test datasets

Step-4:

Break your ratings into separate training and test datasets.

Let's split the User_Books dataset into two parts: Training and Test. I selected 12 ratings to hold out for testing and replaced them with NA in the training set, so that they are omitted from the training calculations. In the test dataset I kept only the values selected for testing; all other values were replaced with NA.

# Hold out 12 ratings as the test set. The two vectors are matched by
# position: element i of each gives the (row, column) of one held-out cell.
test_rows <- c(1, 3, 4, 5, 6, 7, 14, 13, 19, 20, 12, 14)
test_cols <- c(1, 4, 2, 3, 4, 5, 2, 3, 3, 4, 5, 3)
test_indices <- cbind(test_rows, test_cols)

# Guard the split: a repeated (row, column) pair, or a held-out cell that has
# no rating to begin with, would silently shrink the test set.
stopifnot(
    anyDuplicated(test_indices) == 0L,
    !anyNA(data[test_indices])
)

# Training set: the original ratings with the held-out cells blanked out.
data_train <- data
data_train[test_indices] <- NA

# Test set: only the held-out cells keep their rating. Start from an all-NA
# copy (keeps dimensions and dimnames) instead of masking through a 0
# sentinel, so the result does not depend on every rating being positive.
data_test <- data
data_test[] <- NA_real_
data_test[test_indices] <- data[test_indices]

Train Dataset

data_train

  book1 book2 book3 book4 book5
A    NA    NA     4     4     3
B     4     5     3    NA     2
C    NA     3     3    NA     3
D     5    NA     5     5    NA
E     2     3    NA    NA     4
F     3     4     4    NA     3
G     2     2    NA     2    NA
H     3    NA     4     4     4
I     2     3     3    NA     4
J     3    NA     3     3     3
K     1     1     1    NA     1
L    NA     3     4     3    NA
M     3    NA    NA     4     3
N     4    NA    NA    NA     2
O    NA     3     4     4     4
 [ reached getOption("max.print") -- omitted 5 rows ]

Test Dataset

data_test

  book1 book2 book3 book4 book5
A     3    NA    NA    NA    NA
B    NA    NA    NA    NA    NA
C    NA    NA    NA     3    NA
D    NA     5    NA    NA    NA
E    NA    NA     3    NA    NA
F    NA    NA    NA     3    NA
G    NA    NA    NA    NA     2
H    NA    NA    NA    NA    NA
I    NA    NA    NA    NA    NA
J    NA    NA    NA    NA    NA
K    NA    NA    NA    NA    NA
L    NA    NA    NA    NA     4
M    NA    NA     4    NA    NA
N    NA     4     3    NA    NA
O    NA    NA    NA    NA    NA
 [ reached getOption("max.print") -- omitted 5 rows ]

Calculations

Using training data, calculate the raw average (mean) rating for every user-item combination.

This function computes the raw average of the user-item matrix

Mean rating for each user in the User_Books train dataset

# Average rating given by each user in the training set (NA cells ignored).
user_means <- rowMeans(data_train, na.rm = TRUE)

# Long-format copy (one row per user) for plotting. It is built directly from
# the named vector rather than going through a one-row wide data frame and a
# wide-to-long reshape.
user_means_df <- data.frame(
    user = names(user_means),
    value = unname(user_means),
    stringsAsFactors = FALSE
)
p1 <- ggplot(user_means_df, aes(x = user, y = value, fill = user)) +
    geom_col() +
    labs(title = "Plot of Mean User ratings", x = "User", y = "Avg. Rating")

# Renamed only after p1 is built, because the plot's aes() refers to the
# `user` / `value` column names.
colnames(user_means_df) <- c("User", "Rating")
pander(user_means)

Table continues below
A	B	C	D	E	F	G	H	I	J	K	L	M	N
3.667	3.5	3	5	3	3.5	2	3.75	3	3	1	3.333	3.333	3

O	P	Q	R	S	T
3.75	5	3	3	2	3.667

p1

Mean rating for each book in the User_Books train dataset.

# Average rating received by each book in the training set (NA cells ignored).
book_means <- colMeans(data_train, na.rm = TRUE)

# Long-format copy (one row per book) for plotting. It is built directly from
# the named vector rather than going through a one-row wide data frame and a
# wide-to-long reshape.
book_means_df <- data.frame(
    book = names(book_means),
    value = unname(book_means),
    stringsAsFactors = FALSE
)
p2 <- ggplot(book_means_df, aes(x = book, y = value, fill = book)) +
    geom_col() +
    labs(title = "Plot of Book Average Rating", x = "Book", y = "Avg. Rating")

# Renamed only after p2 is built, because the plot's aes() refers to the
# `book` / `value` column names.
colnames(book_means_df) <- c("Book", "Rating")
pander(book_means)

book1	book2	book3	book4	book5
2.938	3.091	3.429	3.636	3.176

p2

Calculate the RMSE for raw average for both your training data and your test data.

Rating for every user-item combination, for Test and Train data sets

# Raw average over the held-out (test) ratings; NA cells are skipped.
# NOTE(review): this is the mean of the test ratings themselves, while the
# baseline predictor further down is built from raw_train only. Confirm
# whether the training mean was meant to be the test-set prediction here.
raw_test <- mean(data_test, na.rm = TRUE)

# Constant matrix, same shape and labels as data_test, holding that average
# in every cell.
raw_test_mat <- matrix(
    raw_test,
    nrow = nrow(data_test),
    ncol = ncol(data_test),
    dimnames = dimnames(data_test)
)
raw_test

[1] 3.333333

# Raw (global) average over all known training ratings; NA cells are skipped.
raw_train <- mean(data_train, na.rm = TRUE)

# Constant matrix, same shape and labels as data_train, holding that average
# in every cell.
raw_train_mat <- matrix(
    raw_train,
    nrow = nrow(data_train),
    ncol = ncol(data_train),
    dimnames = dimnames(data_train)
)
raw_train

[1] 3.231884

RMSE for Test and Train data sets

# RMSE of the raw-average predictor, computed in three steps for each set.

# 1. Squared difference between each rating and the constant prediction
#    (cells without a rating stay NA).
squareDiff_train <- (raw_train_mat - data_train)^2
squareDiff_test <- (raw_test_mat - data_test)^2

# 2. Mean of the squared differences over the cells that have a rating.
squareDiff_train_mean <- mean(squareDiff_train, na.rm = TRUE)
squareDiff_test_mean <- mean(squareDiff_test, na.rm = TRUE)

# 3. Square root of the mean squared difference.
rmse_train <- sqrt(squareDiff_train_mean)
rmse_test <- sqrt(squareDiff_test_mean)

RMSE for train dataset

rmse_train

[1] 1.037624

RMSE for test dataset

rmse_test

[1] 0.8498366

Using your training data, calculate the bias for each user and each item.

User Bias

## user bias
# How far each user's mean training rating sits from the global training
# average.
user_bias <- user_means - raw_train

# Two-column table (one row per user), built directly from the named vector.
user_bias_df <- data.frame(
    User = names(user_bias),
    Bias = unname(user_bias),
    stringsAsFactors = FALSE
)
pander(user_bias_df)

User	Bias
A	0.4348
B	0.2681
C	-0.2319
D	1.768
E	-0.2319
F	0.2681
G	-1.232
H	0.5181
I	-0.2319
J	-0.2319
K	-2.232
L	0.1014
M	0.1014
N	-0.2319
O	0.5181
P	1.768
Q	-0.2319
R	-0.2319
S	-1.232
T	0.4348

Book Bias

# book bias
# How far each book's mean training rating sits from the global training
# average.
book_bias <- book_means - raw_train

# Two-column table (one row per book), built directly from the named vector.
book_bias_df <- data.frame(
    Book = names(book_bias),
    Bias = unname(book_bias),
    stringsAsFactors = FALSE
)
pander(book_bias_df)

Book	Bias
book1	-0.2944
book2	-0.141
book3	0.1967
book4	0.4045
book5	-0.05541

From the raw average, and the appropriate user and item biases, calculate the baseline predictors for every user-item combination.

# raw average + user bias + book bias
#' Baseline predictor for every user-item cell.
#'
#' Fills a matrix shaped like `in_matrix` so that cell (i, j) holds
#' `raw_average[1] + user_bias_in[[i]] + book_bias_in[[j]]`.
#'
#' @param in_matrix Matrix whose dimensions and dimnames define the output;
#'   its values are not read.
#' @param book_bias_in Per-column biases (vector or list of scalars) with at
#'   least `ncol(in_matrix)` elements.
#' @param user_bias_in Per-row biases (vector or list of scalars) with at
#'   least `nrow(in_matrix)` elements.
#' @param raw_average Global average; only the first element is used.
#' @return A copy of `in_matrix` with every cell replaced by its baseline
#'   prediction.
calBaseLine <- function(in_matrix, book_bias_in, user_bias_in, raw_average) {
    n_rows <- nrow(in_matrix)
    n_cols <- ncol(in_matrix)
    # Fail early, with a clear message, if a bias vector is too short.
    stopifnot(
        length(user_bias_in) >= n_rows,
        length(book_bias_in) >= n_cols
    )

    out_matrix <- in_matrix
    # Nothing to fill for an empty matrix; an index loop over 1:0 would
    # instead try to write to a cell that does not exist.
    if (n_rows == 0L || n_cols == 0L) {
        return(out_matrix)
    }

    # `[[` extraction works for both plain vectors and lists of scalars.
    user_part <- vapply(seq_len(n_rows), function(i) user_bias_in[[i]], numeric(1))
    book_part <- vapply(seq_len(n_cols), function(j) book_bias_in[[j]], numeric(1))

    # Evaluated as (raw average + user bias) + book bias for every cell;
    # assigning through `[]` keeps the dimnames of in_matrix.
    out_matrix[] <- outer(raw_average[1] + user_part, book_part, "+")
    out_matrix
}
# Baseline prediction for every user/book cell, built from the training-set
# biases and the training-set raw average, then rendered as a table.
base_pred <- calBaseLine(
    in_matrix = data_train,
    book_bias_in = book_bias,
    user_bias_in = user_bias,
    raw_average = raw_train
)
pander(base_pred)

	book1	book2	book3	book4	book5
A	3.372	3.526	3.863	4.071	3.611
B	3.206	3.359	3.697	3.904	3.445
C	2.706	2.859	3.197	3.404	2.945
D	4.706	4.859	5.197	5.404	4.945
E	2.706	2.859	3.197	3.404	2.945
F	3.206	3.359	3.697	3.904	3.445
G	1.706	1.859	2.197	2.404	1.945
H	3.456	3.609	3.947	4.154	3.695
I	2.706	2.859	3.197	3.404	2.945
J	2.706	2.859	3.197	3.404	2.945
K	0.7056	0.859	1.197	1.404	0.9446
L	3.039	3.192	3.53	3.738	3.278
M	3.039	3.192	3.53	3.738	3.278
N	2.706	2.859	3.197	3.404	2.945
O	3.456	3.609	3.947	4.154	3.695
P	4.706	4.859	5.197	5.404	4.945
Q	2.706	2.859	3.197	3.404	2.945
R	2.706	2.859	3.197	3.404	2.945
S	1.706	1.859	2.197	2.404	1.945
T	3.372	3.526	3.863	4.071	3.611

Calculate the RMSE for the baseline predictors for both your training data and your test data.

## Squared error of the baseline predictor, cell by cell
# Cells without a rating in the respective data set come out as NA.
data_err <- (data_test - base_pred)^2
data_err_train <- (data_train - base_pred)^2

## Test RMSE: root of the mean squared error over the held-out cells only
data_rmse_test <- sqrt(mean(data_err[test_indices]))

## Training RMSE: root of the mean squared error over all rated training cells
data_rmse_train <- sqrt(mean(data_err_train, na.rm = TRUE))

RMSE for test data

data_rmse_test

[1] 0.5250754

RMSE for train data

data_rmse_train

[1] 0.518253

Summarizing results

Let's calculate the percentage improvements based on the original (simple average) and baseline predictor (including bias) RMSE numbers for both the Test and Train data sets.

The results show roughly a 50% improvement in prediction error (RMSE) for the Training data set, whereas the Test data set improves by only about 38%. Both improvements are positive; however, the baseline predictor helped more on the Training data set.

# Train data set
# R1 is the raw-average RMSE and R1_data the baseline-predictor RMSE; the
# improvement is the relative drop in RMSE, expressed in percent.
R1 <- rmse_train
R1_data <- data_rmse_train
Prediction_Improv_Train <- 100 * (1 - R1_data / R1)
Prediction_Improv_Train

[1] 50.0539

# Test data set
# R2 is the raw-average RMSE and R2_data the baseline-predictor RMSE; the
# improvement is the relative drop in RMSE, expressed in percent.
R2 <- rmse_test
R2_data <- data_rmse_test
Prediction_Improv_Test <- 100 * (1 - R2_data / R2)
Prediction_Improv_Test

[1] 38.21455

	book1	book2	book3	book4	book5
A	3	NA	4	4	3
B	4	5	3	NA	2
C	NA	3	3	3	3
D	5	5	5	5	NA
E	2	3	3	NA	4
F	3	4	4	3	3
G	2	2	NA	2	2
H	3	NA	4	4	4
I	2	3	3	NA	4
J	3	NA	3	3	3
K	1	1	1	NA	1
L	NA	3	4	3	4
M	3	NA	4	4	3
N	4	4	3	NA	2
O	NA	3	4	4	4
P	5	5	NA	5	5
Q	2	NA	3	3	4
R	3	NA	3	3	3
S	2	2	2	NA	2
T	3	NA	4	4	4

	book1	book2	book3	book4	book5
A	3	NA	4	4	3
B	4	5	3	NA	2
C	NA	3	3	3	3
D	5	5	5	5	NA
E	2	3	3	NA	4
F	3	4	4	3	3
G	2	2	NA	2	2
H	3	NA	4	4	4
I	2	3	3	NA	4
J	3	NA	3	3	3
K	1	1	1	NA	1
L	NA	3	4	3	4
M	3	NA	4	4	3
N	4	4	3	NA	2
O	NA	3	4	4	4
P	5	5	NA	5	5
Q	2	NA	3	3	4
R	3	NA	3	3	3
S	2	2	2	NA	2
T	3	NA	4	4	4

	book1	book2	book3	book4	book5
A	3	NA	4	4	3
B	4	5	3	NA	2
C	NA	3	3	3	3
D	5	5	5	5	NA
E	2	3	3	NA	4
F	3	4	4	3	3
G	2	2	NA	2	2
H	3	NA	4	4	4
I	2	3	3	NA	4
J	3	NA	3	3	3
K	1	1	1	NA	1
L	NA	3	4	3	4
M	3	NA	4	4	3
N	4	4	3	NA	2
O	NA	3	4	4	4
P	5	5	NA	5	5
Q	2	NA	3	3	4
R	3	NA	3	3	3
S	2	2	2	NA	2
T	3	NA	4	4	4