library(data.table);library(tidyverse);
library(caret);library(randomForest);library(neuralnet);library(NeuralNetTools)
library(rattle);library(DMwR)
df <- read.csv('https://raw.githubusercontent.com/FewPila/Proj_workshop/main/Retail.csv',header = T)
df$Month_Yr <- as.Date(df$Month_Yr)
df
##      Month_Yr     Sales Customers  Open Promo StateHoliday
## 1  2013-01-01 180132207  20380423 28869 11150            2
## 2  2013-02-01 171534275  19244468 26683 11150            0
## 3  2013-03-01 201180369  21969462 27892 16725            1
## 4  2013-04-01 183431432  20882365 27880 13380            1
## 5  2013-05-01 185411063  20723886 26202 14495            4
## 6  2013-06-01 180702351  20473046 27942 11150            0
## 7  2013-07-01 208843882  22872045 30166 14495            0
## 8  2013-08-01 198042727  22314232 30025 13380            1
## 9  2013-09-01 178053963  20350031 27981 11150            0
## 10 2013-10-01 187662330  21371258 28990 11150            2
## 11 2013-11-01 196170924  21543868 28412 11150            2
## 12 2013-12-01 231710561  23879341 26901 11150            2
## 13 2014-01-01 187752787  20893187 28711 11150            2
## 14 2014-02-01 178924677  19972290 26793 11150            0
## 15 2014-03-01 193019854  21486733 29008 12265            0
## 16 2014-04-01 194544977  21345653 26920 13380            2
## 17 2014-05-01 194693257  21505750 28021 13380            2
## 18 2014-06-01 190047051  20579912 26211 12265            2
## 19 2014-07-01 173892707  19299298 25233 12155            0
## 20 2014-08-01 163748475  18516451 24388 10285            0
## 21 2014-09-01 164516168  18667949 24349 11220            0
## 22 2014-10-01 164216398  18734892 24302 12155            2
## 23 2014-11-01 173327953  18686232 22989 14025            2
## 24 2014-12-01 202120592  20800624 23492  9350            2
## 25 2015-01-01 198843730  21110481 28763 16725            2
## 26 2015-02-01 178275458  19481381 26767 11150            0
## 27 2015-03-01 205631683  21809641 29080 13380            0
## 28 2015-04-01 197918886  21053991 26931 13380            2
## 29 2015-05-01 189143897  20193848 25880 12265            3
## 30 2015-06-01 207363373  21645129 28423 13380            1
## 31 2015-07-01 212322616  22253888 30188 14495            0
#denote formula
fml <- formula(Sales ~ Customers + Open + StateHoliday)

Linear Model

####### modeling
model_lm <- lm(fml,data = df)
summary(model_lm)
## 
## Call:
## lm(formula = fml, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -6146361 -3315195  -977541  3289886  8917916 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.293e+07  1.413e+07  -2.331  0.02748 *  
## Customers     1.391e+01  1.017e+00  13.675 1.18e-13 ***
## Open         -2.425e+03  6.832e+02  -3.550  0.00144 ** 
## StateHoliday -4.503e+05  8.473e+05  -0.532  0.59941    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4682000 on 27 degrees of freedom
## Multiple R-squared:  0.9183, Adjusted R-squared:  0.9092 
## F-statistic: 101.1 on 3 and 27 DF,  p-value: 8.49e-15
mape <- function(predict,actual){
  return(mean(abs((actual-predict)/actual)) * 100)
}
#predict
y_pred_lm <- predict(model_lm,df %>% select(-Sales))
#mesuare
postResample(y_pred_lm,df$Sales)
##         RMSE     Rsquared          MAE 
## 4.369146e+06 9.182762e-01 3.677101e+06
mape(y_pred_lm,df$Sales)
## [1] 1.919857
RMSE calculation

RMSE calculation

Desicion Tree

#### desicion Tree 
set.seed(987)
model_tree <- train(fml,data = df,
                    method = 'rpart',
                    metric = 'RMSE')
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
model_tree
## CART 
## 
## 31 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 31, 31, 31, 31, 31, 31, ... 
## Resampling results across tuning parameters:
## 
##   cp         RMSE      Rsquared   MAE     
##   0.0000000  10554865  0.5877755   8382354
##   0.2941975  11812790  0.4983481   9454266
##   0.5883950  13330297  0.4693688  11095501
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.
#predict
y_pred_tree <- predict(model_tree,df %>% select(-Sales))
#mesuare
postResample(y_pred_tree,df$Sales)
##         RMSE     Rsquared          MAE 
## 9.805339e+06 5.883950e-01 7.407776e+06
mape(y_pred_tree,df$Sales)
## [1] 3.86145
fancyRpartPlot(model_tree$finalModel)

Other example

Others Tree

Others Tree

RandomForest

### randomForest
set.seed(987)
model_rf <- train(fml,data = df,
                  method = 'rf',
                  metric = 'RMSE',
                  ntree = 100)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
model_rf
## Random Forest 
## 
## 31 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 31, 31, 31, 31, 31, 31, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE     Rsquared   MAE    
##   2     9517887  0.6832149  7335288
##   3     8933300  0.7326713  6948379
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
#predict
y_pred_rf <- predict(model_rf,df %>% select(-Sales))
#mesuare
postResample(y_pred_rf,df$Sales)
##         RMSE     Rsquared          MAE 
## 3.637387e+06 9.514300e-01 2.827338e+06
mape(y_pred_rf,df$Sales)
## [1] 1.463529

example of RF

Neuralnetwork

### neuralnetwork
## Scale data for neural network
df2 <- df %>% select(-Month_Yr)
max = apply(df2 , 2 , max)
min = apply(df2, 2 , min)
Scaled_df = as.data.frame(scale(df2, center = min, scale = max - min))

ค่าสถิติต่างๆ normalize เป็น min = 0, max = 1

summary(Scaled_df)
##      Sales          Customers           Open            Promo       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2185   1st Qu.:0.2921   1st Qu.:0.4469   1st Qu.:0.2441  
##  Median :0.3737   Median :0.4412   Median :0.6794   Median :0.3953  
##  Mean   :0.3783   Mean   :0.4212   Mean   :0.5903   Mean   :0.4297  
##  3rd Qu.:0.5105   3rd Qu.:0.5610   3rd Qu.:0.7984   3rd Qu.:0.5464  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##   StateHoliday   
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.2500  
##  Mean   :0.2984  
##  3rd Qu.:0.5000  
##  Max.   :1.0000
fml
## Sales ~ Customers + Open + StateHoliday

เลือก data ของเราให้มี column เหมือนกับตัวสมการ

data <- Scaled_df %>% select(Sales,Customers,Open,StateHoliday)
# fit neural network
set.seed(2)
NN = neuralnet(fml,data = data,
               hidden = 3 , linear.output = T )
plot(NN)
data
##          Sales  Customers       Open StateHoliday
## 1  0.241071647 0.34756857 0.81678011         0.50
## 2  0.114560933 0.13575087 0.51312682         0.00
## 3  0.550776119 0.64387131 0.68106681         0.25
## 4  0.289616728 0.44116400 0.67939992         0.25
## 5  0.318745190 0.41161295 0.44631199         1.00
## 6  0.249460795 0.36483967 0.68801222         0.00
## 7  0.663537711 0.81217291 0.99694402         0.00
## 8  0.504608584 0.70815941 0.97735797         0.25
## 9  0.210492185 0.34190147 0.69342964         0.00
## 10 0.351870527 0.53232623 0.83358800         0.50
## 11 0.477066713 0.56451223 0.75329907         0.50
## 12 1.000000000 1.00000000 0.54340881         0.50
## 13 0.353201519 0.44318194 0.79483262         0.50
## 14 0.223303946 0.27146539 0.52840672         0.00
## 15 0.430701597 0.55385846 0.83608835         0.00
## 16 0.453142389 0.52755175 0.54604806         0.50
## 17 0.455324194 0.55740450 0.69898597         0.50
## 18 0.386959517 0.38476661 0.44756216         0.50
## 19 0.149263105 0.14597484 0.31170996         0.00
## 20 0.000000000 0.00000000 0.19433255         0.00
## 21 0.011295901 0.02824932 0.18891513         0.00
## 22 0.006885059 0.04073196 0.18238644         0.50
## 23 0.140953266 0.03165849 0.00000000         0.50
## 24 0.564610642 0.42592203 0.06987082         0.50
## 25 0.516394612 0.48370002 0.80205584         0.50
## 26 0.213751282 0.17992724 0.52479511         0.00
## 27 0.616273138 0.61407003 0.84608973         0.00
## 28 0.502786377 0.47316652 0.54757605         0.50
## 29 0.373670431 0.31277856 0.40158355         0.75
## 30 0.641753374 0.58339403 0.75482706         0.25
## 31 0.714724104 0.69690726 1.00000000         0.00

ได้ค่าเป็น Scale

#เลือกตาม feature เรา
pred_NN = compute(NN, Scaled_df %>% select(-Sales))
pred_NN$net.result
##               [,1]
##  [1,]  0.247017644
##  [2,]  0.092995899
##  [3,]  0.603741796
##  [4,]  0.385132825
##  [5,]  0.388334611
##  [6,]  0.307442821
##  [7,]  0.725236316
##  [8,]  0.582831962
##  [9,]  0.280445398
## [10,]  0.418744447
## [11,]  0.474569780
## [12,]  0.998520658
## [13,]  0.343285292
## [14,]  0.240418631
## [15,]  0.480975932
## [16,]  0.496728980
## [17,]  0.483509857
## [18,]  0.378091473
## [19,]  0.140919787
## [20,] -0.006021789
## [21,]  0.026028378
## [22,]  0.094580196
## [23,]  0.104946314
## [24,]  0.494850000
## [25,]  0.380394053
## [26,]  0.139183725
## [27,]  0.546502639
## [28,]  0.441223948
## [29,]  0.317752763
## [30,]  0.515926211
## [31,]  0.594967110

Unscale กลับเป็นค่าจริง

pred_NN = (pred_NN$net.result * (max(df2$Sales) - min(df2$Sales))) + min(df2$Sales)
pred_NN
##            [,1]
##  [1,] 180536309
##  [2,] 170068670
##  [3,] 204780027
##  [4,] 189922905
##  [5,] 190140505
##  [6,] 184642930
##  [7,] 213037048
##  [8,] 203358951
##  [9,] 182808129
## [10,] 192207221
## [11,] 196001227
## [12,] 231610022
## [13,] 187078860
## [14,] 180087827
## [15,] 196436603
## [16,] 197507213
## [17,] 196608813
## [18,] 189444360
## [19,] 173325678
## [20,] 163339222
## [21,] 165517418
## [22,] 170176342
## [23,] 170880845
## [24,] 197379513
## [25,] 189600848
## [26,] 173207691
## [27,] 200889934
## [28,] 193734975
## [29,] 185343616
## [30,] 198811896
## [31,] 204183681

ทำนาย

#predict
postResample(pred_NN,df$Sales)
##         RMSE     Rsquared          MAE 
## 4.329671e+06 9.197548e-01 3.525694e+06
#MAPE
mape(pred_NN,df$Sales)
## [1] 1.842063

ensemble plot

the_selector <- data.frame('Date' = df$Month_Yr,
                           'Sales' = df$Sales,
                           'lm_pred' = y_pred_lm,
                           'tree_pred' = y_pred_tree,
                           'rf_pred' = y_pred_rf,
                           'nn_pred' = pred_NN)
the_selector
##          Date     Sales   lm_pred tree_pred   rf_pred   nn_pred
## 1  2013-01-01 180132207 179643479 174705711 181731171 180536309
## 2  2013-02-01 171534275 170045096 174705711 172651065 170068670
## 3  2013-03-01 201180369 204566782 198774321 204310251 204780027
## 4  2013-04-01 183431432 189474498 198774321 188762857 189922905
## 5  2013-05-01 185411063 189988918 198774321 188742499 190140505
## 6  2013-06-01 180702351 184080876 174705711 181278386 184642930
## 7  2013-07-01 208843882 212056550 198774321 207697276 213037048
## 8  2013-08-01 198042727 204189088 198774321 203605455 203358951
## 9  2013-09-01 178053963 182275161 174705711 180043071 182808129
## 10 2013-10-01 187662330 193132401 198774321 190992043 192207221
## 11 2013-11-01 196170924 196935274 198774321 195313583 196001227
## 12 2013-12-01 231710561 233086223 198774321 220423121 231610022
## 13 2014-01-01 187752787 187159181 198774321 189320921 187078860
## 14 2014-02-01 178924677 179902218 174705711 179288102 180087827
## 15 2014-03-01 193019854 195595652 198774321 192982256 196436603
## 16 2014-04-01 194544977 197796842 198774321 195658644 197507213
## 17 2014-05-01 194693257 197353393 198774321 195247498 196608813
## 18 2014-06-01 190047051 188865093 198774321 186871684 189444360
## 19 2014-07-01 173892707 174324622 174705711 172358122 173325678
## 20 2014-08-01 163748475 165484787 174705711 165990512 163339222
## 21 2014-09-01 164516168 167686697 174705711 166595246 165517418
## 22 2014-10-01 164216398 167831196 174705711 168120816 170176342
## 23 2014-11-01 173327953 170338907 174705711 169256010 170880845
## 24 2014-12-01 202120592 198529866 198774321 196900579 197379513
## 25 2015-01-01 198843730 190055593 198774321 194344603 189600848
## 26 2015-02-01 178275458 173136793 174705711 176615594 173207691
## 27 2015-03-01 205631683 199912634 198774321 204618992 200889934
## 28 2015-04-01 197918886 193713179 198774321 197196735 193734975
## 29 2015-05-01 189143897 183847468 174705711 184246453 185343616
## 30 2015-06-01 207363373 198767455 198774321 202082373 198811896
## 31 2015-07-01 212322616 203404700 198774321 207869405 204183681
plot_data <- the_selector %>% gather('Type','Sales',Sales,lm_pred,tree_pred,rf_pred,nn_pred)
plot_data %>% ggplot(aes(x=Date,y=Sales,colour = Type)) + geom_line()

#filter only 2 model LM&RF
plot_data2 <- plot_data %>% filter(Type != 'tree_pred',
                                   Type != 'nn_pred')
plot_data2 %>% ggplot(aes(x=Date,y=Sales,colour = Type,alpha = Type)) + geom_line(size = 1.5) +
  scale_alpha_manual(values = c(0.45,0.6,1))