This system recommends the chain restaurants on the Old Country Road to customers.

# Toy Data set
row_name <- c("A","B","C","D","E")
col_name<-c("Outback Steakhouse","Applebee's","Olive Garden","Red Lobster", "Cheesecake Factory")
m <- matrix(c(4,3,NA,5,4,3,1,2,2,3,NA,2,NA,4,3,3,NA,3,4,NA,2,4,1,NA,4),nrow=5,byrow=TRUE, dimnames = list(row_name,col_name))
m

##   Outback Steakhouse Applebee's Olive Garden Red Lobster
## A                  4          3           NA           5
## B                  3          1            2           2
## C                 NA          2           NA           4
## D                  3         NA            3           4
## E                  2          4            1          NA
##   Cheesecake Factory
## A                  4
## B                  3
## C                  3
## D                 NA
## E                  4

# Split training and testing matrix, and replace the value from test set into "NA"
training <- c(4,3,NA,NA,4,NA,1,2,NA,3,NA,2,NA,4,3,3,NA,NA,4,NA,2,4,1,NA,NA)
training_m <- matrix(c(4,3,NA,NA,4,NA,1,2,NA,3,NA,2,NA,4,3,3,NA,NA,4,NA,2,4,1,NA,NA),nrow=5,byrow=TRUE, dimnames = list(row_name,col_name))
testing_m <-c(5,3,2,3,4)

# raw average
raw_average <- mean(training,na.rm=TRUE)
raw_average

## [1] 2.857143

rmse_training <- sqrt(mean((training-raw_average)^2,na.rm = TRUE))
rmse_training

## [1] 1.059457

rmse_testing <- sqrt(mean((testing_m-raw_average)^2))
rmse_testing

## [1] 1.15529

# Find bias
training_m

##   Outback Steakhouse Applebee's Olive Garden Red Lobster
## A                  4          3           NA          NA
## B                 NA          1            2          NA
## C                 NA          2           NA           4
## D                  3         NA           NA           4
## E                  2          4            1          NA
##   Cheesecake Factory
## A                  4
## B                  3
## C                  3
## D                 NA
## E                 NA

user_mean <- rowMeans(training_m,na.rm=TRUE)
user_mean

##        A        B        C        D        E 
## 3.666667 2.000000 3.000000 3.500000 2.333333

user_bias <- user_mean-raw_average
user_bias

##          A          B          C          D          E 
##  0.8095238 -0.8571429  0.1428571  0.6428571 -0.5238095

food_mean <- colMeans(training_m,na.rm=TRUE)
food_mean

## Outback Steakhouse         Applebee's       Olive Garden 
##           3.000000           2.500000           1.500000 
##        Red Lobster Cheesecake Factory 
##           4.000000           3.333333

food_bias <- food_mean - raw_average
food_bias

## Outback Steakhouse         Applebee's       Olive Garden 
##          0.1428571         -0.3571429         -1.3571429 
##        Red Lobster Cheesecake Factory 
##          1.1428571          0.4761905

# baseline predictor

user_bias_matrix<-matrix(c(user_bias[1],user_bias[1],user_bias[1],user_bias[1],user_bias[1],user_bias[2],user_bias[2],user_bias[2],user_bias[2],user_bias[2],user_bias[3],user_bias[3],user_bias[3],user_bias[3],user_bias[3],user_bias[4],user_bias[4],user_bias[4],user_bias[4],user_bias[4],user_bias[5],user_bias[5],user_bias[5],user_bias[5],user_bias[5]),nrow=5,byrow = TRUE)

user_bias_matrix

##            [,1]       [,2]       [,3]       [,4]       [,5]
## [1,]  0.8095238  0.8095238  0.8095238  0.8095238  0.8095238
## [2,] -0.8571429 -0.8571429 -0.8571429 -0.8571429 -0.8571429
## [3,]  0.1428571  0.1428571  0.1428571  0.1428571  0.1428571
## [4,]  0.6428571  0.6428571  0.6428571  0.6428571  0.6428571
## [5,] -0.5238095 -0.5238095 -0.5238095 -0.5238095 -0.5238095

food_bias_matrix <-matrix(c(food_bias[1],food_bias[1],food_bias[1],food_bias[1],food_bias[1],food_bias[2],food_bias[2],food_bias[2],food_bias[2],food_bias[2],food_bias[3],food_bias[3],food_bias[3],food_bias[3],food_bias[3],food_bias[4],food_bias[4],food_bias[4],food_bias[4],food_bias[4],food_bias[5],food_bias[5],food_bias[5],food_bias[5],food_bias[5]),nrow=5)

food_bias_matrix

##           [,1]       [,2]      [,3]     [,4]      [,5]
## [1,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
## [2,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
## [3,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
## [4,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905
## [5,] 0.1428571 -0.3571429 -1.357143 1.142857 0.4761905

m[is.na(m)]<-0
baseline_predictor <- m+user_bias_matrix+food_bias_matrix
baseline_predictor

##   Outback Steakhouse Applebee's Olive Garden Red Lobster
## A          4.9523810  3.4523810   -0.5476190   6.9523810
## B          2.2857143 -0.2142857   -0.2142857   2.2857143
## C          0.2857143  1.7857143   -1.2142857   5.2857143
## D          3.7857143  0.2857143    2.2857143   5.7857143
## E          1.6190476  3.1190476   -0.8809524   0.6190476
##   Cheesecake Factory
## A           5.285714
## B           2.619048
## C           3.619048
## D           1.119048
## E           3.952381

# rmse for test and train
baseline_predictor[baseline_predictor<0] <-0
baseline_predictor[baseline_predictor>5]<- 5
baseline_predictor

##   Outback Steakhouse Applebee's Olive Garden Red Lobster
## A          4.9523810  3.4523810     0.000000   5.0000000
## B          2.2857143  0.0000000     0.000000   2.2857143
## C          0.2857143  1.7857143     0.000000   5.0000000
## D          3.7857143  0.2857143     2.285714   5.0000000
## E          1.6190476  3.1190476     0.000000   0.6190476
##   Cheesecake Factory
## A           5.000000
## B           2.619048
## C           3.619048
## D           1.119048
## E           3.952381

average_bp <- mean(baseline_predictor)
rmse_bp <- sqrt(mean((baseline_predictor-average_bp)^2))
rmse_bp

## [1] 1.838195

rmse_bp_test <- sqrt((0+(3-2.2857)^2+(2-2.2857)^2+(3-2.2857)^2+(4-3.952581)^2)/5)
rmse_bp_test

## [1] 0.4699621

# percent of improvement for test and train
percent_improvement_test <- (1-rmse_bp_test/rmse_testing)*100
percent_improvement_test

## [1] 59.32083

percent_improvement_training <- (1-rmse_bp/rmse_training)*100
percent_improvement_training

## [1] -73.50351

The percent of improvement for test is better which is 59%. However, the percent of improvement for the training is worse which is -74%. It means that after I did the baseline predictor, it does not improve my user-item matrix. Here are the reasons the I guess. The set is a toyset which means I made them up so there are bias in the data set. Second of all, I think when I construct this data set, I put many NA values there. They exist in the reality but for a 5 by 5 matrix, it maybe too many of them.

Data 612 Project 1

This system recommends the chain restaurants on the Old Country Road to customers.