# Toy Data set
row_name <- c("A","B","C","D","E")
col_name<-c("Outback Steakhouse","Applebee's","Olive Garden","Red Lobster", "Cheesecake Factory")
m <- matrix(c(4,3,NA,5,4,3,1,2,2,3,NA,2,NA,4,3,3,NA,3,4,NA,2,4,1,NA,4),nrow=5,byrow=TRUE, dimnames = list(row_name,col_name))
m
## Outback Steakhouse Applebee's Olive Garden Red Lobster
## A 4 3 NA 5
## B 3 1 2 2
## C NA 2 NA 4
## D 3 NA 3 4
## E 2 4 1 NA
## Cheesecake Factory
## A 4
## B 3
## C 3
## D NA
## E 4
# Split training and testing matrix, and replace the value from test set into "NA"
training <- c(4,3,NA,NA,4,NA,1,2,NA,3,NA,2,NA,4,3,3,NA,NA,4,NA,2,4,1,NA,NA)
training_m <- matrix(c(4,3,NA,NA,4,NA,1,2,NA,3,NA,2,NA,4,3,3,NA,NA,4,NA,2,4,1,NA,NA),nrow=5,byrow=TRUE, dimnames = list(row_name,col_name))
testing_m <-c(5,3,2,3,4)
# raw average
raw_average <- mean(training,na.rm=TRUE)
raw_average
## [1] 2.857143
rmse_training <- sqrt(mean((training-raw_average)^2,na.rm = TRUE))
rmse_training
## [1] 1.059457
rmse_testing <- sqrt(mean((testing_m-raw_average)^2))
rmse_testing
## [1] 1.15529
# Find bias
training_m
## Outback Steakhouse Applebee's Olive Garden Red Lobster
## A 4 3 NA NA
## B NA 1 2 NA
## C NA 2 NA 4
## D 3 NA NA 4
## E 2 4 1 NA
## Cheesecake Factory
## A 4
## B 3
## C 3
## D NA
## E NA
user_mean <- rowMeans(training_m,na.rm=TRUE)
user_mean
## A B C D E
## 3.666667 2.000000 3.000000 3.500000 2.333333
user_bias <- user_mean-raw_average
user_bias
## A B C D E
## 0.8095238 -0.8571429 0.1428571 0.6428571 -0.5238095
food_mean <- colMeans(training_m,na.rm=TRUE)
food_mean
## Outback Steakhouse Applebee's Olive Garden
## 3.000000 2.500000 1.500000
## Red Lobster Cheesecake Factory
## 4.000000 3.333333
food_bias <- food_mean - raw_average
food_bias
## Outback Steakhouse Applebee's Olive Garden
## 0.1428571 -0.3571429 -1.3571429
## Red Lobster Cheesecake Factory
## 1.1428571 0.4761905
# baseline predictor
user_bias_matrix<-matrix(c(user_bias[1],user_bias[1],user_bias[1],user_bias[1],user_bias[1],user_bias[2],user_bias[2],user_bias[2],user_bias[2],user_bias[2],user_bias[3],user_bias[3],user_bias[3],user_bias[3],user_bias[3],user_bias[4],user_bias[4],user_bias[4],user_bias[4],user_bias[4],user_bias[5],user_bias[5],user_bias[5],user_bias[5],user_bias[5]),nrow=5,byrow = TRUE)
user_bias_matrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.8095238 0.8095238 0.8095238 0.8095238 0.8095238
## [2,] -0.8571429 -0.8571429 -0.8571429 -0.8571429 -0.8571429
## [3,] 0.1428571 0.1428571 0.1428571 0.1428571 0.1428571
## [4,] 0.6428571 0.6428571 0.6428571 0.6428571 0.6428571
## [5,] -0.5238095 -0.5238095 -0.5238095 -0.5238095 -0.5238095
food_bias_matrix <-matrix(c(food_bias[1],food_bias[1],food_bias[1],food_bias[1],food_bias[1],food_bias[2],food_bias[2],food_bias[2],food_bias[2],food_bias[2],food_bias[3],food_bias[3],food_bias[3],food_bias[3],food_bias[3],food_bias[4],food_bias[4],food_bias[4],food_bias[4],food_bias[4],food_bias[5],food_bias[5],food_bias[5],food_bias[5],food_bias[5]),nrow=5)
food_bias_matrix
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
## [2,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
## [3,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
## [4,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
## [5,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
m[is.na(m)]<-0
baseline_predictor <- m+user_bias_matrix+food_bias_matrix
baseline_predictor
## Outback Steakhouse Applebee's Olive Garden Red Lobster
## A 4.9523810 3.4523810 -0.5476190 6.9523810
## B 2.2857143 -0.2142857 -0.2142857 2.2857143
## C 0.2857143 1.7857143 -1.2142857 5.2857143
## D 3.7857143 0.2857143 2.2857143 5.7857143
## E 1.6190476 3.1190476 -0.8809524 0.6190476
## Cheesecake Factory
## A 5.285714
## B 2.619048
## C 3.619048
## D 1.119048
## E 3.952381
# rmse for test and train
baseline_predictor[baseline_predictor<0] <-0
baseline_predictor[baseline_predictor>5]<- 5
baseline_predictor
## Outback Steakhouse Applebee's Olive Garden Red Lobster
## A 4.9523810 3.4523810 0.000000 5.0000000
## B 2.2857143 0.0000000 0.000000 2.2857143
## C 0.2857143 1.7857143 0.000000 5.0000000
## D 3.7857143 0.2857143 2.285714 5.0000000
## E 1.6190476 3.1190476 0.000000 0.6190476
## Cheesecake Factory
## A 5.000000
## B 2.619048
## C 3.619048
## D 1.119048
## E 3.952381
average_bp <- mean(baseline_predictor)
rmse_bp <- sqrt(mean((baseline_predictor-average_bp)^2))
rmse_bp
## [1] 1.838195
rmse_bp_test <- sqrt((0+(3-2.2857)^2+(2-2.2857)^2+(3-2.2857)^2+(4-3.952581)^2)/5)
rmse_bp_test
## [1] 0.4699621
# percent of improvement for test and train
percent_improvement_test <- (1-rmse_bp_test/rmse_testing)*100
percent_improvement_test
## [1] 59.32083
percent_improvement_training <- (1-rmse_bp/rmse_training)*100
percent_improvement_training
## [1] -73.50351
The percent of improvement for test is better which is 59%. However, the percent of improvement for the training is worse which is -74%. It means that after I did the baseline predictor, it does not improve my user-item matrix. Here are the reasons the I guess. The set is a toyset which means I made them up so there are bias in the data set. Second of all, I think when I construct this data set, I put many NA values there. They exist in the reality but for a 5 by 5 matrix, it maybe too many of them.