library(data.table);library(tidyverse);
library(caret);library(randomForest);library(neuralnet);library(NeuralNetTools)
library(rattle);library(DMwR)

Train / Test

# Load the retail dataset from the workshop repository. The first line of the
# CSV holds the column names. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
df <- read.csv(
  'https://raw.githubusercontent.com/FewPila/Proj_workshop/main/Retail.csv',
  header = TRUE
)
# Convert Month_Yr from text to Date (relies on as.Date's default formats).
df$Month_Yr <- as.Date(df$Month_Yr)
# Mean absolute percentage error, expressed in percent.
#
# predict: numeric vector (or one-column matrix) of predicted values.
# actual:  numeric vector of observed values. Must have the same length as
#          `predict`; a length-one value on either side is recycled.
# na.rm:   when TRUE, NA percentage errors are dropped before averaging.
#          Defaults to FALSE, so NA inputs still yield NA.
#
# Returns a single number: mean(|actual - predict| / |actual|) * 100.
# A zero in `actual` produces Inf or NaN, as with any division by zero.
mape <- function(predict, actual, na.rm = FALSE) {
  n_predict <- length(predict)
  n_actual <- length(actual)
  # Guard against silent recycling of unequal-length inputs, which would
  # otherwise return a plausible-looking but meaningless number.
  if (n_predict != n_actual && n_predict != 1L && n_actual != 1L) {
    stop("`predict` and `actual` must have the same length", call. = FALSE)
  }
  mean(abs((actual - predict) / actual), na.rm = na.rm) * 100
}
image

image

# Split 75% train / 25% test by row position: the first `size` rows are used
# for training and the remaining rows for testing.
size <- floor(0.75 * nrow(df))
# seq_len() keeps both index sets correct for tiny inputs; `1:size` and
# `(size + 1):nrow(df)` count backwards when size is 0 or equals nrow(df),
# which makes the two splits overlap.
train <- df[seq_len(size), ]
test <- df[seq_len(nrow(df)) > size, ]

Train Test vs Actual

# Stack both splits into one long table, tagging each row with the split it
# came from, so they can be drawn as two coloured lines on the same axes.
plot <- rbind(
  train %>% mutate(type = 'train') %>% select(Month_Yr, Sales, type),
  test %>% mutate(type = 'test') %>% select(Month_Yr, Sales, type)
)
ggplot(plot, aes(x = Month_Yr, y = Sales, color = type)) +
  geom_line(size = 1.5) +
  ggtitle('Train + Test plot')

# Actual series: Sales over Month_Yr for the whole dataset.
ggplot(data = df, mapping = aes(x = Month_Yr, y = Sales)) +
  geom_line(size = 1.5) +
  ggtitle('Actual Plot')

Time to predict

# Model formula shared by every model below: Sales is the response;
# Customers, Open and StateHoliday are the predictors.
fml <- Sales ~ Customers + Open + StateHoliday

##Linear Regression

# Modeling: fit a linear regression through caret. No trControl is given, so
# caret's default resampling is used.
model_lm <- train(
  fml,
  data = train,
  method = "lm"
)
print(model_lm)
## Linear Regression 
## 
## 23 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 23, 23, 23, 23, 23, 23, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE    
##   2939640  0.9733211  2500899
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Test: predict the hold-out rows with Sales removed from the inputs, then
# compare the predictions against the observed Sales.
y_pred_lm <- predict(model_lm, newdata = select(test, -Sales))
postResample(pred = y_pred_lm, obs = test$Sales)
##         RMSE     Rsquared          MAE 
## 9.081654e+06 9.317436e-01 8.644700e+06
mape(predict = y_pred_lm, actual = test$Sales)
## [1] 4.318548

Random Forest

### Random forest with 500 trees, selected on RMSE
# Seed the RNG before fitting so the run can be repeated.
set.seed(987)
model_rf <- train(
  fml,
  data = train,
  method = "rf",
  metric = "RMSE",
  ntree = 500
)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
print(model_rf)
## Random Forest 
## 
## 23 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 23, 23, 23, 23, 23, 23, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE     Rsquared   MAE    
##   2     9014983  0.7409417  6638188
##   3     8154973  0.7909153  5937225
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
# Predict the hold-out rows (Sales removed from the inputs).
y_pred_rf <- predict(model_rf, select(test, -Sales))
# Measure accuracy against the observed Sales.
postResample(pred = y_pred_rf, obs = test$Sales)
##         RMSE     Rsquared          MAE 
## 1.080220e+07 8.787360e-01 1.019029e+07
mape(predict = y_pred_rf, actual = test$Sales)
## [1] 5.086864
### Neural network
# The network is trained on min-max scaled data. The scaling parameters are
# taken from the training set only and reused for (a) scaling the test set and
# (b) mapping predictions back to the original Sales scale, so all three steps
# use one consistent transformation.
#
# NOTE(review): every "##" output line in this section was recorded before the
# fixes below (train-based scaling, by-name predictor selection) and no longer
# matches the code. Re-run the section to regenerate it.
train2 <- train %>% select(-Month_Yr)
## Scale data for neural network
# Distinct names are used so base::min() / base::max() are not shadowed.
train_min <- vapply(train2, min, numeric(1))
train_max <- vapply(train2, max, numeric(1))
TrainNN <- as.data.frame(
  scale(train2, center = train_min, scale = train_max - train_min)
)
# fit neural network
set.seed(2)
NN <- neuralnet(fml, data = TrainNN, hidden = 3, linear.output = TRUE)
# plot neural network
plot(NN)
test2 <- test %>% select(-Month_Yr)
## Scale the test data with the TRAINING min/max; re-deriving the range from
## the test set would put its inputs on a different scale than the one the
## network was trained on.
testNN <- as.data.frame(
  scale(test2, center = train_min, scale = train_max - train_min)
)
testNN
# recorded output (pre-fix run):
##        Sales Customers      Open     Promo StateHoliday
## 24 0.7003561 0.4758304 0.0000000 0.0000000    0.6666667
## 25 0.6041113 0.5875909 0.7871864 1.0000000    0.6666667
## 26 0.0000000 0.0000000 0.4890980 0.2440678    0.0000000
## 27 0.8034804 0.8397670 0.8345281 0.5464407    0.0000000
## 28 0.5769477 0.5672159 0.5135902 0.5464407    0.6666667
## 29 0.3192172 0.2569757 0.3566308 0.3952542    1.0000000
## 30 0.8543419 0.7804301 0.7364098 0.5464407    0.3333333
## 31 1.0000000 1.0000000 1.0000000 0.6976271    0.0000000
# Select the predictors by name, in formula order. Positional columns 2:4 are
# Customers, Open, Promo, which passed Promo where the model expects
# StateHoliday.
nn_inputs <- all.vars(fml)[-1]
# neuralnet:: is explicit because dplyr also exports a compute() generic.
predict_testNN <- neuralnet::compute(NN, testNN[, nn_inputs])
predict_testNN$net.result
# recorded output (pre-fix run):
##           [,1]
## 24  0.51473157
## 25  0.44676548
## 26 -0.01931064
## 27  0.70231738
## 28  0.50983217
## 29  0.24055921
## 30  0.67002229
## 31  0.80578400
# Undo the min-max scaling with the same Sales range that scaled the training
# target (not the range of the full dataset).
sales_min <- train_min[["Sales"]]
sales_max <- train_max[["Sales"]]
predict_testNN <- predict_testNN$net.result * (sales_max - sales_min) + sales_min
predict_testNN
# recorded output (pre-fix run):
##         [,1]
## 24 198730706
## 25 194111589
## 26 162436083
## 27 211479429
## 28 198397733
## 29 180097380
## 30 209284587
## 31 218511237
postResample(predict_testNN, test$Sales)
# recorded output (pre-fix run):
##         RMSE     Rsquared          MAE 
## 7.441650e+06 9.811737e-01 5.930543e+06
mape(predict_testNN, test$Sales)
# recorded output (pre-fix run):
## [1] 3.08145

Tuning

Cross validation

image

image

# Resampling scheme: 10-fold cross-validation.
CV10 <- trainControl(method = "cv", number = 10)
### Random forest with 50 trees, tuned under 10-fold CV
set.seed(987)
model_rf <- train(
  fml,
  data = train,
  method = "rf",
  metric = "RMSE",
  ntree = 50,
  trControl = CV10
)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
print(model_rf)
## Random Forest 
## 
## 23 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 21, 21, 20, 21, 21, 21, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE     Rsquared   MAE    
##   2     7061872  0.8885345  5790342
##   3     5771025  0.8911296  4833213
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
# Predict the hold-out rows with the 10-fold-CV model (Sales removed).
y_pred_rf <- predict(model_rf, select(test, -Sales))
# Measure accuracy against the observed Sales.
postResample(pred = y_pred_rf, obs = test$Sales)
##         RMSE     Rsquared          MAE 
## 1.090743e+07 8.872654e-01 1.034696e+07
mape(predict = y_pred_rf, actual = test$Sales)
## [1] 5.169578
# Resampling scheme: 5-fold cross-validation.
CV5 <- trainControl(method = "cv", number = 5)
### Random forest with 10 trees, tuned under 5-fold CV
set.seed(987)
model_rf <- train(
  fml,
  data = train,
  method = "rf",
  metric = "RMSE",
  ntree = 10,
  trControl = CV5
)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
print(model_rf)
## Random Forest 
## 
## 23 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 19, 18, 19, 17, 19 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE     Rsquared   MAE    
##   2     8667131  0.8308711  6707059
##   3     7833293  0.8815515  5928980
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
# Predict the hold-out rows with the 5-fold-CV model (Sales removed).
y_pred_rf <- predict(model_rf, select(test, -Sales))
# Measure accuracy against the observed Sales.
postResample(pred = y_pred_rf, obs = test$Sales)
##         RMSE     Rsquared          MAE 
## 1.077017e+07 8.576839e-01 1.006039e+07
mape(predict = y_pred_rf, actual = test$Sales)
## [1] 5.032735

Hidden Layers

# Fit a deeper neural network: three hidden layers with 3, 3 and 2 neurons.
# NOTE(review): the "##" output lines below were recorded before the fixes in
# this block and no longer match the code; re-run to regenerate them.
set.seed(2)
NN <- neuralnet(fml, data = TrainNN, hidden = c(3, 3, 2), linear.output = TRUE)
plot(NN)
# Select the predictors by name, in formula order. Positional columns 2:4 are
# Customers, Open, Promo, which passed Promo where the model expects
# StateHoliday. neuralnet:: is explicit because dplyr also exports compute().
predict_testNN <- neuralnet::compute(NN, testNN[, all.vars(fml)[-1]])
# TrainNN's Sales column was min-max scaled with the training range, so the
# network output is mapped back with that same range (not the full-data range).
sales_range <- range(train$Sales)
predict_testNN <- predict_testNN$net.result * diff(sales_range) + sales_range[1]
postResample(predict_testNN, test$Sales)
# recorded output (pre-fix run):
##         RMSE     Rsquared          MAE 
## 8.022601e+06 9.347481e-01 7.261999e+06
mape(predict_testNN, test$Sales)
# recorded output (pre-fix run):
## [1] 3.737429

Workshop

# Workshop dataset, read straight from the workshop repository.
new_data <- read.csv(
  file = 'https://raw.githubusercontent.com/FewPila/Proj_workshop/main/Suicide_TH.csv'
)