This system recommends scents to buyers. Each scent is given a score on a scale from 0 to 10. This is a toy data set with 7 scents and 10 buyers.
# Load the training ratings; the values extracted for the test set have been replaced with NA.
train.data <- read.csv(file = "train")
train.data
## buyer scent1 scent2 scent3 scent4 scent5 scent6 scent7
## 1 buyer1 7 NA 10 8 7 6 9
## 2 buyer2 8 7 6 NA 9 10 8
## 3 buyer3 NA NA NA NA NA 7 8
## 4 buyer4 7 NA NA NA NA 10 8
## 5 buyer5 7 NA NA NA NA 6 7
## 6 buyer6 7 NA NA NA NA 9 9
## 7 buyer7 8 9 7 NA 6 8 9
## 8 buyer8 NA 5 6 7 7 6 7
## 9 buyer9 4 NA 3 5 NA 5 6
## 10 buyer10 8 7 8 9 NA 9 9
# Load the test ratings, i.e. the values that were extracted from the training data.
test.data <- read.csv(file = "test")
test.data
## buyer scent2 scent3 scent4 scent5
## 1 buyer3 9 9 8 8
## 2 buyer4 9 5 8 8
## 3 buyer5 5 5 5 8
## 4 buyer6 9 8 8 6
Calculating the raw average rating of the training set, which serves as the prediction for every user-item combination.
# Keep only the numeric rating columns (the buyer id column is dropped) and
# convert each data frame to a matrix of numeric values or NAs.
# vapply() always yields a logical mask (sapply() returns list() for a
# zero-column data frame), and drop = FALSE keeps a single selected column as
# a data frame so its column name survives the conversion.
data.numcols.train <- data.matrix(
  train.data[, vapply(train.data, is.numeric, logical(1)), drop = FALSE]
)
data.numcols.test <- data.matrix(
  test.data[, vapply(test.data, is.numeric, logical(1)), drop = FALSE]
)
# Raw average rating: the sum of all observed (non-NA) ratings in df divided
# by the number of observed ratings. Returns a single number.
raw_avg <- function(df) {
  observed_total <- sum(apply(df, 2, sum, na.rm = TRUE))
  observed_count <- sum(!is.na(df))
  observed_total / observed_count
}
# Raw average of all observed ratings in the training set
raw_avg_train <- raw_avg(df = data.numcols.train)
raw_avg_train
## [1] 7.347826
# Root-mean-square error between the ratings in df and the prediction
# raw_avg_df; cells where the difference is NA are ignored.
RMSE_raw_avg <- function(df, raw_avg_df) {
  squared_error <- (df - raw_avg_df)^2
  mean_squared_error <- mean(squared_error, na.rm = TRUE)
  sqrt(mean_squared_error)
}
# RMSE of the raw-average predictor on the training set
RMSE_raw_avg(df = data.numcols.train, raw_avg_df = raw_avg_train)
## [1] 1.549437
# RMSE of the raw-average predictor on the test set (using the training-set average)
RMSE_raw_avg(df = data.numcols.test, raw_avg_df = raw_avg_train)
## [1] 1.536266
# Bias of each scent (item): the mean observed rating of each column of df
# minus the raw average of df. Returns one value per column.
bias_scent <- function(df) {
  scent_means <- colMeans(df, na.rm = TRUE)
  overall_avg <- raw_avg(df)
  scent_means - overall_avg
}
# Scent (item) biases on the training set
bias_scent(df = data.numcols.train)
## scent1 scent2 scent3 scent4 scent5 scent6
## -0.34782609 -0.34782609 -0.68115942 -0.09782609 -0.09782609 0.25217391
## scent7
## 0.65217391
# Bias of each buyer (user): the mean observed rating of each row of df
# minus the raw average of df. Returns one value per row.
bias_buyer <- function(df) {
  buyer_means <- rowMeans(df, na.rm = TRUE)
  overall_avg <- raw_avg(df)
  buyer_means - overall_avg
}
# Buyer (user) biases on the training set
bias_buyer(df = data.numcols.train)
## [1] 0.4855072 0.6521739 0.1521739 0.9855072 -0.6811594 0.9855072
## [7] 0.4855072 -1.0144928 -2.7478261 0.9855072
# Baseline predictor for every buyer-scent combination:
#   raw average of df + scent (column) bias + buyer (row) bias.
#
# Arguments:
#   df  ratings matrix with buyers in rows and scents in columns (NA = unrated).
#   nc  number of columns of the returned matrix; defaults to ncol(df).
#   nr  number of rows of the returned matrix; defaults to nrow(df).
#
# Returns an nr x nc numeric matrix without dimnames; cell [j, i] holds the
# prediction for row j and column i of df. Cells beyond the extent of df
# stay NA; nr or nc smaller than df raises a subscript error.
baseline <- function(df, nc = ncol(df), nr = nrow(df)) {
  # Take the average from df itself instead of reading the global
  # raw_avg_train, so the result depends only on the arguments.
  overall_avg <- raw_avg(df)
  scent_bias <- bias_scent(df)
  buyer_bias <- bias_buyer(df)

  baseline_mtrx <- matrix(NA_real_, nrow = nr, ncol = nc)
  # outer() builds all buyer/scent sums at once; the average is added to the
  # scent bias first to keep the same order of additions as the former loop.
  baseline_mtrx[seq_along(buyer_bias), seq_along(scent_bias)] <-
    outer(buyer_bias, overall_avg + scent_bias, "+")
  baseline_mtrx
}
# Baseline predictors for every buyer-scent combination of the training set.
# Dimensions and labels are taken from the data instead of being hard-coded,
# so this keeps working when buyers or scents are added or removed.
baseline_train_set <- baseline(
  data.numcols.train,
  ncol(data.numcols.train),
  nrow(data.numcols.train)
)
colnames(baseline_train_set) <- colnames(data.numcols.train)
# as.character() covers read.csv() returning the buyer ids as a factor
# (the default before R 4.0).
rownames(baseline_train_set) <- as.character(train.data[["buyer"]])
baseline_train_set
## scent1 scent2 scent3 scent4 scent5 scent6 scent7
## buyer1 7.485507 7.485507 7.152174 7.735507 7.735507 8.085507 8.485507
## buyer2 7.652174 7.652174 7.318841 7.902174 7.902174 8.252174 8.652174
## buyer3 7.152174 7.152174 6.818841 7.402174 7.402174 7.752174 8.152174
## buyer4 7.985507 7.985507 7.652174 8.235507 8.235507 8.585507 8.985507
## buyer5 6.318841 6.318841 5.985507 6.568841 6.568841 6.918841 7.318841
## buyer6 7.985507 7.985507 7.652174 8.235507 8.235507 8.585507 8.985507
## buyer7 7.485507 7.485507 7.152174 7.735507 7.735507 8.085507 8.485507
## buyer8 5.985507 5.985507 5.652174 6.235507 6.235507 6.585507 6.985507
## buyer9 4.252174 4.252174 3.918841 4.502174 4.502174 4.852174 5.252174
## buyer10 7.985507 7.985507 7.652174 8.235507 8.235507 8.585507 8.985507
# Baseline predictions for the held-out cells. Rows and columns are selected
# by buyer and scent name rather than by position, so they stay aligned with
# the test set even if its buyers or scents change.
baseline_test_set <- baseline_train_set[
  as.character(test.data[["buyer"]]),
  colnames(data.numcols.test),
  drop = FALSE
]
baseline_test_set
## scent2 scent3 scent4 scent5
## buyer3 7.152174 6.818841 7.402174 7.402174
## buyer4 7.985507 7.652174 8.235507 8.235507
## buyer5 6.318841 5.985507 6.568841 6.568841
## buyer6 7.985507 7.652174 8.235507 8.235507
# Root-mean-square error between the ratings in df and the baseline
# predictions in baseline_df; cells where the difference is NA are ignored.
RMSE_baseline <- function(df, baseline_df) {
  prediction_error <- df - baseline_df
  sqrt(mean(prediction_error^2, na.rm = TRUE))
}
# RMSE of the baseline predictors on the training set
RMSE_baseline(df = data.numcols.train, baseline_df = baseline_train_set)
## [1] 0.9287973
# RMSE of the baseline predictors on the test set
RMSE_baseline(df = data.numcols.test, baseline_df = baseline_test_set)
## [1] 1.380309
# Summarising results: one row per data set with the RMSE of the raw-average
# predictor, the RMSE of the baseline predictor, and the relative improvement
# in percent.
# Column names are data-set neutral because rbind() keeps only the first
# block's names, which previously labelled the test row as *_TRAIN.
summary_row <- function(rmse_raw, rmse_base) {
  cbind(
    RMSE_RAW_AVG = rmse_raw,
    RMSE_BASELINE = rmse_base,
    improvement_pct = round(1 - rmse_base / rmse_raw, 3) * 100
  )
}
m0 <- summary_row(
  RMSE_raw_avg(data.numcols.train, raw_avg_train),
  RMSE_baseline(data.numcols.train, baseline_train_set)
)
m1 <- summary_row(
  RMSE_raw_avg(data.numcols.test, raw_avg_train),
  RMSE_baseline(data.numcols.test, baseline_test_set)
)
# NOTE: the name `summary` is kept for compatibility; it masks base::summary
# as a variable only, calls to summary() still resolve to the function.
summary <- rbind(m0, m1)
rownames(summary) <- c("training set", "test set")
summary
## RMSE_RAW_AVG RMSE_BASELINE improvement_pct
## training set 1.549437 0.9287973 40.1
## test set 1.536266 1.3803088 10.2
RMSE is lower for the baseline predictor compared to the raw-average predictor: a 40% improvement on the training set and a 10% improvement on the test set.