Summarize your results.
Built a sample dataset of 10 users and 10 books, with ratings assigned at random and some of them set to missing (NA).
# Simulate a 10 x 10 user-by-book matrix of ratings drawn uniformly from 1..5
set.seed(612)
df <- matrix(sample(1:5, 100, replace = TRUE), nrow = 10)

# linear cell indices of the 10 ratings held out of the training data
split_df <- sample(seq_along(df), 10, replace = FALSE)

# training data: the full matrix with the held-out cells blanked
train_df <- df
train_df[split_df] <- NA

# test data: only the held-out cells are kept
test_df <- df
test_df[-split_df] <- NA

# blank 10 cells in all three matrices to mimic unrated books; these indices
# are drawn independently of split_df, so a held-out cell can be blanked too
set.seed(612)
missing_df <- sample(seq_along(df), 10, replace = FALSE)
df[missing_df] <- NA
train_df[missing_df] <- NA
test_df[missing_df] <- NA

# row labels: one per user
users <- paste0("User_", seq_len(nrow(df)))
rownames(df) <- users
rownames(train_df) <- users
rownames(test_df) <- users

# column labels: one per book
colname <- paste0("Book_", seq_len(ncol(df)))
colnames(df) <- colname
colnames(train_df) <- colname
colnames(test_df) <- colname
# print the full ratings matrix as a styled, scrollable table;
# row_spec(0, ...) applies the bold white-on-red styling to row 0
kable(df, caption = "User-Book Ratings") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872") %>%
  scroll_box(width = "100%", height = "300px")

| | Book_1 | Book_2 | Book_3 | Book_4 | Book_5 | Book_6 | Book_7 | Book_8 | Book_9 | Book_10 |
|---|---|---|---|---|---|---|---|---|---|---|
| User_1 | 4 | 1 | 5 | 4 | 3 | 1 | 5 | 4 | 2 | 1 |
| User_2 | 3 | 5 | 4 | 1 | NA | 1 | 3 | 1 | 3 | 5 |
| User_3 | 2 | 4 | 1 | 2 | 4 | 3 | 1 | 3 | 5 | 5 |
| User_4 | 1 | 4 | 4 | 3 | 1 | 1 | 5 | 4 | 5 | 5 |
| User_5 | 1 | 4 | NA | 1 | 4 | 4 | 3 | 4 | 2 | 5 |
| User_6 | 3 | NA | 1 | 3 | 3 | 4 | NA | 4 | 2 | 5 |
| User_7 | 3 | 2 | NA | 2 | 1 | 2 | 4 | 4 | 3 | 5 |
| User_8 | 2 | NA | 4 | 2 | NA | 5 | 2 | 1 | 2 | 1 |
| User_9 | 5 | 4 | 5 | 5 | 1 | 4 | 4 | 1 | NA | 5 |
| User_10 | 2 | 1 | NA | 3 | NA | 4 | 5 | 5 | 3 | 3 |
Building a training dataset
# print the training matrix (held-out and missing cells show as NA)
kable(train_df, caption = "Training Dataset") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872") %>%
  scroll_box(width = "100%", height = "300px")

| | Book_1 | Book_2 | Book_3 | Book_4 | Book_5 | Book_6 | Book_7 | Book_8 | Book_9 | Book_10 |
|---|---|---|---|---|---|---|---|---|---|---|
| User_1 | 4 | 1 | 5 | 4 | NA | 1 | 5 | 4 | 2 | 1 |
| User_2 | 3 | 5 | 4 | 1 | NA | NA | 3 | NA | 3 | 5 |
| User_3 | 2 | 4 | 1 | 2 | 4 | 3 | 1 | 3 | 5 | 5 |
| User_4 | 1 | 4 | 4 | 3 | 1 | 1 | 5 | 4 | 5 | 5 |
| User_5 | 1 | 4 | NA | 1 | NA | 4 | 3 | 4 | 2 | NA |
| User_6 | 3 | NA | NA | 3 | 3 | 4 | NA | 4 | 2 | 5 |
| User_7 | 3 | 2 | NA | 2 | 1 | 2 | 4 | 4 | 3 | 5 |
| User_8 | NA | NA | 4 | 2 | NA | 5 | 2 | 1 | 2 | 1 |
| User_9 | 5 | 4 | 5 | 5 | 1 | 4 | 4 | 1 | NA | 5 |
| User_10 | 2 | 1 | NA | 3 | NA | 4 | 5 | NA | 3 | NA |
Building a test dataset
# print the test matrix (only held-out cells carry a rating)
kable(test_df, caption = "test_df Dataset") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872") %>%
  scroll_box(width = "100%", height = "300px")

| | Book_1 | Book_2 | Book_3 | Book_4 | Book_5 | Book_6 | Book_7 | Book_8 | Book_9 | Book_10 |
|---|---|---|---|---|---|---|---|---|---|---|
| User_1 | NA | NA | NA | NA | 3 | NA | NA | NA | NA | NA |
| User_2 | NA | NA | NA | NA | NA | 1 | NA | 1 | NA | NA |
| User_3 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| User_4 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| User_5 | NA | NA | NA | NA | 4 | NA | NA | NA | NA | 5 |
| User_6 | NA | NA | 1 | NA | NA | NA | NA | NA | NA | NA |
| User_7 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| User_8 | 2 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| User_9 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| User_10 | NA | NA | NA | NA | NA | NA | NA | 5 | NA | 3 |
Computed the raw average of the training ratings and used the replicate function to build a user-item matrix filled with it
# raw average: mean of all non-missing training ratings, to 2 decimals
raw_avg <- round(mean(train_df, na.rm = TRUE), 2)

# user-item matrix for the raw average: the same value in every cell, with
# shape and labels taken from train_df instead of hard-coded sizes
user_item <- matrix(
  replicate(length(train_df), raw_avg),
  nrow = nrow(train_df),
  dimnames = dimnames(train_df)
)

kable(user_item, caption = "User-Item Matrix") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872") %>%
  scroll_box(width = "100%", height = "300px")

| | Book_1 | Book_2 | Book_3 | Book_4 | Book_5 | Book_6 | Book_7 | Book_8 | Book_9 | Book_10 |
|---|---|---|---|---|---|---|---|---|---|---|
| User_1 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_2 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_3 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_4 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_5 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_6 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_7 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_8 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_9 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
| User_10 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 | 3.11 |
Calculating bias for each user using rowMeans function
# bias for each user: the user's mean training rating minus the raw average,
# rounded to 2 decimals (missing ratings are ignored)
user_bias <- round(rowMeans(train_df, na.rm = TRUE) - raw_avg, 2)

kable(user_bias, caption = "User Bias") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872") %>%
  scroll_box(width = "100%", height = "300px")

| | x |
|---|---|
| User_1 | -0.11 |
| User_2 | 0.32 |
| User_3 | -0.11 |
| User_4 | 0.19 |
| User_5 | -0.40 |
| User_6 | 0.32 |
| User_7 | -0.22 |
| User_8 | -0.68 |
| User_9 | 0.67 |
| User_10 | -0.11 |
Calculating bias for each item using colMeans function
# bias for each item: the book's mean training rating minus the raw average,
# rounded to 2 decimals (missing ratings are ignored)
item_bias <- round(colMeans(train_df, na.rm = TRUE) - raw_avg, 2)

kable(item_bias, caption = "Item Bias") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872") %>%
  scroll_box(width = "100%", height = "300px")

| | x |
|---|---|
| Book_1 | -0.44 |
| Book_2 | 0.02 |
| Book_3 | 0.72 |
| Book_4 | -0.51 |
| Book_5 | -1.11 |
| Book_6 | 0.00 |
| Book_7 | 0.45 |
| Book_8 | 0.02 |
| Book_9 | -0.11 |
| Book_10 | 0.89 |
Calculating baseline predictors for every user-item combination
# sum of user bias and item bias for every user-item pair; outer() yields a
# users x books matrix, flattened column by column so that `com` stays a
# plain vector with the user index varying fastest
com <- as.vector(outer(user_bias, item_bias, "+"))

# baseline predictor for every user-item combination:
#   raw average + user bias + item bias
# the shape comes from the bias vectors and the labels from train_df
baseline <- matrix(
  raw_avg + com,
  nrow = length(user_bias),
  dimnames = dimnames(train_df)
)

kable(baseline, caption = "Baseline Predictors") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872") %>%
  scroll_box(width = "100%", height = "300px")

| | Book_1 | Book_2 | Book_3 | Book_4 | Book_5 | Book_6 | Book_7 | Book_8 | Book_9 | Book_10 |
|---|---|---|---|---|---|---|---|---|---|---|
| User_1 | 2.56 | 3.02 | 3.72 | 2.49 | 1.89 | 3.00 | 3.45 | 3.02 | 2.89 | 3.89 |
| User_2 | 2.99 | 3.45 | 4.15 | 2.92 | 2.32 | 3.43 | 3.88 | 3.45 | 3.32 | 4.32 |
| User_3 | 2.56 | 3.02 | 3.72 | 2.49 | 1.89 | 3.00 | 3.45 | 3.02 | 2.89 | 3.89 |
| User_4 | 2.86 | 3.32 | 4.02 | 2.79 | 2.19 | 3.30 | 3.75 | 3.32 | 3.19 | 4.19 |
| User_5 | 2.27 | 2.73 | 3.43 | 2.20 | 1.60 | 2.71 | 3.16 | 2.73 | 2.60 | 3.60 |
| User_6 | 2.99 | 3.45 | 4.15 | 2.92 | 2.32 | 3.43 | 3.88 | 3.45 | 3.32 | 4.32 |
| User_7 | 2.45 | 2.91 | 3.61 | 2.38 | 1.78 | 2.89 | 3.34 | 2.91 | 2.78 | 3.78 |
| User_8 | 1.99 | 2.45 | 3.15 | 1.92 | 1.32 | 2.43 | 2.88 | 2.45 | 2.32 | 3.32 |
| User_9 | 3.34 | 3.80 | 4.50 | 3.27 | 2.67 | 3.78 | 4.23 | 3.80 | 3.67 | 4.67 |
| User_10 | 2.56 | 3.02 | 3.72 | 2.49 | 1.89 | 3.00 | 3.45 | 3.02 | 2.89 | 3.89 |
Calculating RMSE for baseline predictors for training and testing data
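RMSE is computed as `round(sqrt(mean((x - y)^2, na.rm = TRUE)), 2)`, i.e. $\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(x_i - y_i)^2}$ taken over the $n$ non-missing cells and rounded to two decimals.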
#' Root-mean-square error between observed and predicted ratings
#'
#' @param x Numeric vector or matrix of observed values.
#' @param y Predictions compared against `x`: a single number, or an object
#'   that combines with `x` under R's ordinary arithmetic rules.
#' @param digits Number of decimal places kept in the result (default 2,
#'   the precision used throughout this report).
#' @return A single number: the square root of the mean squared difference,
#'   computed over the cells where the difference is not `NA`, rounded to
#'   `digits` places.
rmse <- function(x, y, digits = 2) {
  squared_error <- (x - y)^2
  round(sqrt(mean(squared_error, na.rm = TRUE)), digits)
}
# rmse of the raw-average predictor on the training data
rmse1 <- rmse(train_df, raw_avg)
# rmse of the raw-average predictor on the test data
rmse2 <- rmse(test_df, raw_avg)
# rmse of the baseline predictors: rmse3 is the TEST error, rmse4 the TRAIN
# error
rmse3 <- rmse(test_df, baseline)
rmse4 <- rmse(train_df, baseline)

# summary of the result
# the columns are labelled Train, Test, Train, Test, so the baseline pair is
# passed as (rmse4, rmse3) to put the training error under "Train"
kable(
  cbind(rmse1, rmse2, rmse4, rmse3),
  col.names = rep(c("Train", "Test"), 2),
  caption = "Summary"
) %>%
  add_header_above(c("Raw Average" = 2, "Baseline Predictor" = 2)) %>%
  kable_styling(
    bootstrap_options = c(
      "striped", "bordered", "hover", "condensed", "responsive"
    )
  ) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872")

| Train | Test | Train | Test |
|---|---|---|---|
| 1.44 | 1.58 | 1.29 | 1.99 |