library(data.table)
library(tidyverse)
library(caret)
library(randomForest)
library(neuralnet)
library(NeuralNetTools)
library(rattle)
library(DMwR)
df <- read.csv('https://raw.githubusercontent.com/FewPila/Proj_workshop/main/Retail.csv',header = T)
df$Month_Yr <- as.Date(df$Month_Yr)
mape <- function(predict,actual){
return(mean(abs((actual-predict)/actual)) * 100)
}
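# Quick sanity check of the helper on made-up numbers (hypothetical values, not from the dataset):
mape(predict = c(95, 210), actual = c(100, 200))
# both predictions are off by 5%, so this should return 5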
# split: 75% train, 25% test
size <- floor(0.75*nrow(df))
train <- df[1:size,]
test <- df[(size+1):nrow(df),]
df_plot <- rbind(train %>% select(Month_Yr, Sales) %>% mutate(type = 'train'),
test %>% select(Month_Yr, Sales) %>% mutate(type = 'test'))
ggplot(df_plot, aes(x = Month_Yr, y = Sales, color = type)) + geom_line(size = 1.5) + ggtitle('Train + Test plot')
#actual
ggplot(df,aes(x=Month_Yr,y = Sales)) + geom_line(size= 1.5) + ggtitle('Actual Plot')
# define the model formula
fml <- formula(Sales ~ Customers + Open + StateHoliday)
### Linear Regression
# modeling
model_lm <- train(fml,data = train,
method = 'lm')
model_lm
## Linear Regression
##
## 23 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 23, 23, 23, 23, 23, 23, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2939640 0.9733211 2500899
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#test
y_pred_lm <- predict(model_lm,newdata = test %>% select(-Sales))
postResample(y_pred_lm,test$Sales)
## RMSE Rsquared MAE
## 9.081654e+06 9.317436e-01 8.644700e+06
mape(y_pred_lm,test$Sales)
## [1] 4.318548
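# The fitted coefficients behind the caret wrapper can be inspected via the underlying lm object
# (a sketch; the exact estimates depend on the training split):
summary(model_lm$finalModel)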
### randomForest
set.seed(987)
model_rf <- train(fml,data = train,
method = 'rf',
metric = 'RMSE',
ntree = 500)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
model_rf
## Random Forest
##
## 23 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 23, 23, 23, 23, 23, 23, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 9014983 0.7409417 6638188
## 3 8154973 0.7909153 5937225
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
#predict
y_pred_rf <- predict(model_rf,test %>% select(-Sales))
# measure
postResample(y_pred_rf,test$Sales)
## RMSE Rsquared MAE
## 1.080220e+07 8.787360e-01 1.019029e+07
mape(y_pred_rf,test$Sales)
## [1] 5.086864
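# caret can also report which predictors the forest relies on most
# (a sketch; the importance values vary with the seed and ntree):
varImp(model_rf)
plot(varImp(model_rf))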
train2 <- train %>% select(-Month_Yr)
## Scale data for neural network
max <- apply(train2, 2, max)
min <- apply(train2, 2, min)
TrainNN <- as.data.frame(scale(train2, center = min, scale = max - min))
# fit neural network
set.seed(2)
NN = neuralnet(fml,data = TrainNN, hidden = 3 , linear.output = T )
# plot neural network
plot(NN)
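# NeuralNetTools (loaded above) offers an alternative view of the same network
# (a sketch; garson() gives relative importance and applies to single-hidden-layer networks like this one):
plotnet(NN)
garson(NN)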
test2 <- test %>% select(-Month_Yr)
## Scale data for neural network
max <- apply(test2, 2, max)
min <- apply(test2, 2, min)
testNN <- as.data.frame(scale(test2, center = min, scale = max - min))
testNN
## Sales Customers Open Promo StateHoliday
## 24 0.7003561 0.4758304 0.0000000 0.0000000 0.6666667
## 25 0.6041113 0.5875909 0.7871864 1.0000000 0.6666667
## 26 0.0000000 0.0000000 0.4890980 0.2440678 0.0000000
## 27 0.8034804 0.8397670 0.8345281 0.5464407 0.0000000
## 28 0.5769477 0.5672159 0.5135902 0.5464407 0.6666667
## 29 0.3192172 0.2569757 0.3566308 0.3952542 1.0000000
## 30 0.8543419 0.7804301 0.7364098 0.5464407 0.3333333
## 31 1.0000000 1.0000000 1.0000000 0.6976271 0.0000000
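# Note that the test set above is rescaled with its own min/max; a common alternative is to
# reuse the scaling computed on the training set so both sets share one scale
# (a sketch; trainMin, trainMax and testNN_alt are hypothetical names, not part of the original script):
trainMax <- apply(train2, 2, max)
trainMin <- apply(train2, 2, min)
testNN_alt <- as.data.frame(scale(test2, center = trainMin, scale = trainMax - trainMin))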
predict_testNN = compute(NN, testNN[, c("Customers", "Open", "StateHoliday")])
predict_testNN$net.result
## [,1]
## 24 0.51473157
## 25 0.44676548
## 26 -0.01931064
## 27 0.70231738
## 28 0.50983217
## 29 0.24055921
## 30 0.67002229
## 31 0.80578400
predict_testNN = (predict_testNN$net.result * (max(df$Sales) - min(df$Sales))) + min(df$Sales)
predict_testNN
## [,1]
## 24 198730706
## 25 194111589
## 26 162436083
## 27 211479429
## 28 198397733
## 29 180097380
## 30 209284587
## 31 218511237
postResample(predict_testNN,test$Sales)
## RMSE Rsquared MAE
## 7.441650e+06 9.811737e-01 5.930543e+06
mape(predict_testNN,test$Sales)
## [1] 3.08145
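# Collecting the test-set MAPE values reported above into one small table for comparison
# (numbers copied from the outputs above):
results <- data.frame(model = c('Linear Regression', 'Random Forest', 'Neural Network'),
test_MAPE = c(4.318548, 5.086864, 3.08145))
results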
CV10 <- trainControl(method = 'cv',number = 10)
### randomForest
set.seed(987)
model_rf <- train(fml,data = train,
method = 'rf',
metric = 'RMSE',
ntree = 50,
trControl = CV10)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
model_rf
## Random Forest
##
## 23 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 21, 21, 20, 21, 21, 21, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 7061872 0.8885345 5790342
## 3 5771025 0.8911296 4833213
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
#predict
y_pred_rf <- predict(model_rf,test %>% select(-Sales))
# measure
postResample(y_pred_rf,test$Sales)
## RMSE Rsquared MAE
## 1.090743e+07 8.872654e-01 1.034696e+07
mape(y_pred_rf,test$Sales)
## [1] 5.169578
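# The "Truncating the grid" note appears because three predictors leave only mtry = 2 or 3 to try;
# the tuning grid can also be set explicitly (a sketch; model_rf_grid is a hypothetical name):
set.seed(987)
model_rf_grid <- train(fml, data = train,
method = 'rf',
metric = 'RMSE',
ntree = 50,
trControl = CV10,
tuneGrid = expand.grid(mtry = c(2, 3)))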
CV5 <- trainControl(method = 'cv',
number = 5)
### randomForest
set.seed(987)
model_rf <- train(fml,data = train,
method = 'rf',
metric = 'RMSE',
ntree = 10,
trControl = CV5)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
model_rf
## Random Forest
##
## 23 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 19, 18, 19, 17, 19
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 8667131 0.8308711 6707059
## 3 7833293 0.8815515 5928980
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
#predict
y_pred_rf <- predict(model_rf,test %>% select(-Sales))
# measure
postResample(y_pred_rf,test$Sales)
## RMSE Rsquared MAE
## 1.077017e+07 8.576839e-01 1.006039e+07
mape(y_pred_rf,test$Sales)
## [1] 5.032735
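# With only 23 training rows the fold assignment strongly affects the resampled scores;
# repeated cross-validation averages over several random fold splits
# (a sketch with hypothetical settings; RCV5 and model_rf_rcv are not part of the original script):
RCV5 <- trainControl(method = 'repeatedcv', number = 5, repeats = 5)
set.seed(987)
model_rf_rcv <- train(fml, data = train,
method = 'rf',
metric = 'RMSE',
ntree = 50,
trControl = RCV5)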
new_data <- read.csv('https://raw.githubusercontent.com/FewPila/Proj_workshop/main/Suicide_TH.csv')