library(data.table);library(tidyverse);
library(caret);library(randomForest);library(neuralnet);library(NeuralNetTools)
library(rattle);library(DMwR)
df <- read.csv('https://raw.githubusercontent.com/FewPila/Proj_workshop/main/Retail.csv',header = T)
df$Month_Yr <- as.Date(df$Month_Yr)
df
## Month_Yr Sales Customers Open Promo StateHoliday
## 1 2013-01-01 180132207 20380423 28869 11150 2
## 2 2013-02-01 171534275 19244468 26683 11150 0
## 3 2013-03-01 201180369 21969462 27892 16725 1
## 4 2013-04-01 183431432 20882365 27880 13380 1
## 5 2013-05-01 185411063 20723886 26202 14495 4
## 6 2013-06-01 180702351 20473046 27942 11150 0
## 7 2013-07-01 208843882 22872045 30166 14495 0
## 8 2013-08-01 198042727 22314232 30025 13380 1
## 9 2013-09-01 178053963 20350031 27981 11150 0
## 10 2013-10-01 187662330 21371258 28990 11150 2
## 11 2013-11-01 196170924 21543868 28412 11150 2
## 12 2013-12-01 231710561 23879341 26901 11150 2
## 13 2014-01-01 187752787 20893187 28711 11150 2
## 14 2014-02-01 178924677 19972290 26793 11150 0
## 15 2014-03-01 193019854 21486733 29008 12265 0
## 16 2014-04-01 194544977 21345653 26920 13380 2
## 17 2014-05-01 194693257 21505750 28021 13380 2
## 18 2014-06-01 190047051 20579912 26211 12265 2
## 19 2014-07-01 173892707 19299298 25233 12155 0
## 20 2014-08-01 163748475 18516451 24388 10285 0
## 21 2014-09-01 164516168 18667949 24349 11220 0
## 22 2014-10-01 164216398 18734892 24302 12155 2
## 23 2014-11-01 173327953 18686232 22989 14025 2
## 24 2014-12-01 202120592 20800624 23492 9350 2
## 25 2015-01-01 198843730 21110481 28763 16725 2
## 26 2015-02-01 178275458 19481381 26767 11150 0
## 27 2015-03-01 205631683 21809641 29080 13380 0
## 28 2015-04-01 197918886 21053991 26931 13380 2
## 29 2015-05-01 189143897 20193848 25880 12265 3
## 30 2015-06-01 207363373 21645129 28423 13380 1
## 31 2015-07-01 212322616 22253888 30188 14495 0
#denote formula
fml <- formula(Sales ~ Customers + Open + StateHoliday)
####### modeling
model_lm <- lm(fml,data = df)
summary(model_lm)
##
## Call:
## lm(formula = fml, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6146361 -3315195 -977541 3289886 8917916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.293e+07 1.413e+07 -2.331 0.02748 *
## Customers 1.391e+01 1.017e+00 13.675 1.18e-13 ***
## Open -2.425e+03 6.832e+02 -3.550 0.00144 **
## StateHoliday -4.503e+05 8.473e+05 -0.532 0.59941
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4682000 on 27 degrees of freedom
## Multiple R-squared: 0.9183, Adjusted R-squared: 0.9092
## F-statistic: 101.1 on 3 and 27 DF, p-value: 8.49e-15
mape <- function(predict,actual){
return(mean(abs((actual-predict)/actual)) * 100)
}
#predict
y_pred_lm <- predict(model_lm,df %>% select(-Sales))
#mesuare
postResample(y_pred_lm,df$Sales)
## RMSE Rsquared MAE
## 4.369146e+06 9.182762e-01 3.677101e+06
mape(y_pred_lm,df$Sales)
## [1] 1.919857
RMSE calculation
#### desicion Tree
set.seed(987)
model_tree <- train(fml,data = df,
method = 'rpart',
metric = 'RMSE')
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
model_tree
## CART
##
## 31 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 31, 31, 31, 31, 31, 31, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.0000000 10554865 0.5877755 8382354
## 0.2941975 11812790 0.4983481 9454266
## 0.5883950 13330297 0.4693688 11095501
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.
#predict
y_pred_tree <- predict(model_tree,df %>% select(-Sales))
#mesuare
postResample(y_pred_tree,df$Sales)
## RMSE Rsquared MAE
## 9.805339e+06 5.883950e-01 7.407776e+06
mape(y_pred_tree,df$Sales)
## [1] 3.86145
fancyRpartPlot(model_tree$finalModel)
Others Tree
### randomForest
set.seed(987)
model_rf <- train(fml,data = df,
method = 'rf',
metric = 'RMSE',
ntree = 100)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
model_rf
## Random Forest
##
## 31 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 31, 31, 31, 31, 31, 31, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 9517887 0.6832149 7335288
## 3 8933300 0.7326713 6948379
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
#predict
y_pred_rf <- predict(model_rf,df %>% select(-Sales))
#mesuare
postResample(y_pred_rf,df$Sales)
## RMSE Rsquared MAE
## 3.637387e+06 9.514300e-01 2.827338e+06
mape(y_pred_rf,df$Sales)
## [1] 1.463529
### neuralnetwork
## Scale data for neural network
df2 <- df %>% select(-Month_Yr)
max = apply(df2 , 2 , max)
min = apply(df2, 2 , min)
Scaled_df = as.data.frame(scale(df2, center = min, scale = max - min))
ค่าสถิติต่างๆ normalize เป็น min = 0, max = 1
summary(Scaled_df)
## Sales Customers Open Promo
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2185 1st Qu.:0.2921 1st Qu.:0.4469 1st Qu.:0.2441
## Median :0.3737 Median :0.4412 Median :0.6794 Median :0.3953
## Mean :0.3783 Mean :0.4212 Mean :0.5903 Mean :0.4297
## 3rd Qu.:0.5105 3rd Qu.:0.5610 3rd Qu.:0.7984 3rd Qu.:0.5464
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## StateHoliday
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.2500
## Mean :0.2984
## 3rd Qu.:0.5000
## Max. :1.0000
fml
## Sales ~ Customers + Open + StateHoliday
เลือก data ของเราให้มี column เหมือนกับตัวสมการ
data <- Scaled_df %>% select(Sales,Customers,Open,StateHoliday)
# fit neural network
set.seed(2)
NN = neuralnet(fml,data = data,
hidden = 3 , linear.output = T )
plot(NN)
data
## Sales Customers Open StateHoliday
## 1 0.241071647 0.34756857 0.81678011 0.50
## 2 0.114560933 0.13575087 0.51312682 0.00
## 3 0.550776119 0.64387131 0.68106681 0.25
## 4 0.289616728 0.44116400 0.67939992 0.25
## 5 0.318745190 0.41161295 0.44631199 1.00
## 6 0.249460795 0.36483967 0.68801222 0.00
## 7 0.663537711 0.81217291 0.99694402 0.00
## 8 0.504608584 0.70815941 0.97735797 0.25
## 9 0.210492185 0.34190147 0.69342964 0.00
## 10 0.351870527 0.53232623 0.83358800 0.50
## 11 0.477066713 0.56451223 0.75329907 0.50
## 12 1.000000000 1.00000000 0.54340881 0.50
## 13 0.353201519 0.44318194 0.79483262 0.50
## 14 0.223303946 0.27146539 0.52840672 0.00
## 15 0.430701597 0.55385846 0.83608835 0.00
## 16 0.453142389 0.52755175 0.54604806 0.50
## 17 0.455324194 0.55740450 0.69898597 0.50
## 18 0.386959517 0.38476661 0.44756216 0.50
## 19 0.149263105 0.14597484 0.31170996 0.00
## 20 0.000000000 0.00000000 0.19433255 0.00
## 21 0.011295901 0.02824932 0.18891513 0.00
## 22 0.006885059 0.04073196 0.18238644 0.50
## 23 0.140953266 0.03165849 0.00000000 0.50
## 24 0.564610642 0.42592203 0.06987082 0.50
## 25 0.516394612 0.48370002 0.80205584 0.50
## 26 0.213751282 0.17992724 0.52479511 0.00
## 27 0.616273138 0.61407003 0.84608973 0.00
## 28 0.502786377 0.47316652 0.54757605 0.50
## 29 0.373670431 0.31277856 0.40158355 0.75
## 30 0.641753374 0.58339403 0.75482706 0.25
## 31 0.714724104 0.69690726 1.00000000 0.00
ได้ค่าเป็น Scale
#เลือกตาม feature เรา
pred_NN = compute(NN, Scaled_df %>% select(-Sales))
pred_NN$net.result
## [,1]
## [1,] 0.247017644
## [2,] 0.092995899
## [3,] 0.603741796
## [4,] 0.385132825
## [5,] 0.388334611
## [6,] 0.307442821
## [7,] 0.725236316
## [8,] 0.582831962
## [9,] 0.280445398
## [10,] 0.418744447
## [11,] 0.474569780
## [12,] 0.998520658
## [13,] 0.343285292
## [14,] 0.240418631
## [15,] 0.480975932
## [16,] 0.496728980
## [17,] 0.483509857
## [18,] 0.378091473
## [19,] 0.140919787
## [20,] -0.006021789
## [21,] 0.026028378
## [22,] 0.094580196
## [23,] 0.104946314
## [24,] 0.494850000
## [25,] 0.380394053
## [26,] 0.139183725
## [27,] 0.546502639
## [28,] 0.441223948
## [29,] 0.317752763
## [30,] 0.515926211
## [31,] 0.594967110
pred_NN = (pred_NN$net.result * (max(df2$Sales) - min(df2$Sales))) + min(df2$Sales)
pred_NN
## [,1]
## [1,] 180536309
## [2,] 170068670
## [3,] 204780027
## [4,] 189922905
## [5,] 190140505
## [6,] 184642930
## [7,] 213037048
## [8,] 203358951
## [9,] 182808129
## [10,] 192207221
## [11,] 196001227
## [12,] 231610022
## [13,] 187078860
## [14,] 180087827
## [15,] 196436603
## [16,] 197507213
## [17,] 196608813
## [18,] 189444360
## [19,] 173325678
## [20,] 163339222
## [21,] 165517418
## [22,] 170176342
## [23,] 170880845
## [24,] 197379513
## [25,] 189600848
## [26,] 173207691
## [27,] 200889934
## [28,] 193734975
## [29,] 185343616
## [30,] 198811896
## [31,] 204183681
ทำนาย
#predict
postResample(pred_NN,df$Sales)
## RMSE Rsquared MAE
## 4.329671e+06 9.197548e-01 3.525694e+06
#MAPE
mape(pred_NN,df$Sales)
## [1] 1.842063
the_selector <- data.frame('Date' = df$Month_Yr,
'Sales' = df$Sales,
'lm_pred' = y_pred_lm,
'tree_pred' = y_pred_tree,
'rf_pred' = y_pred_rf,
'nn_pred' = pred_NN)
the_selector
## Date Sales lm_pred tree_pred rf_pred nn_pred
## 1 2013-01-01 180132207 179643479 174705711 181731171 180536309
## 2 2013-02-01 171534275 170045096 174705711 172651065 170068670
## 3 2013-03-01 201180369 204566782 198774321 204310251 204780027
## 4 2013-04-01 183431432 189474498 198774321 188762857 189922905
## 5 2013-05-01 185411063 189988918 198774321 188742499 190140505
## 6 2013-06-01 180702351 184080876 174705711 181278386 184642930
## 7 2013-07-01 208843882 212056550 198774321 207697276 213037048
## 8 2013-08-01 198042727 204189088 198774321 203605455 203358951
## 9 2013-09-01 178053963 182275161 174705711 180043071 182808129
## 10 2013-10-01 187662330 193132401 198774321 190992043 192207221
## 11 2013-11-01 196170924 196935274 198774321 195313583 196001227
## 12 2013-12-01 231710561 233086223 198774321 220423121 231610022
## 13 2014-01-01 187752787 187159181 198774321 189320921 187078860
## 14 2014-02-01 178924677 179902218 174705711 179288102 180087827
## 15 2014-03-01 193019854 195595652 198774321 192982256 196436603
## 16 2014-04-01 194544977 197796842 198774321 195658644 197507213
## 17 2014-05-01 194693257 197353393 198774321 195247498 196608813
## 18 2014-06-01 190047051 188865093 198774321 186871684 189444360
## 19 2014-07-01 173892707 174324622 174705711 172358122 173325678
## 20 2014-08-01 163748475 165484787 174705711 165990512 163339222
## 21 2014-09-01 164516168 167686697 174705711 166595246 165517418
## 22 2014-10-01 164216398 167831196 174705711 168120816 170176342
## 23 2014-11-01 173327953 170338907 174705711 169256010 170880845
## 24 2014-12-01 202120592 198529866 198774321 196900579 197379513
## 25 2015-01-01 198843730 190055593 198774321 194344603 189600848
## 26 2015-02-01 178275458 173136793 174705711 176615594 173207691
## 27 2015-03-01 205631683 199912634 198774321 204618992 200889934
## 28 2015-04-01 197918886 193713179 198774321 197196735 193734975
## 29 2015-05-01 189143897 183847468 174705711 184246453 185343616
## 30 2015-06-01 207363373 198767455 198774321 202082373 198811896
## 31 2015-07-01 212322616 203404700 198774321 207869405 204183681
plot_data <- the_selector %>% gather('Type','Sales',Sales,lm_pred,tree_pred,rf_pred,nn_pred)
plot_data %>% ggplot(aes(x=Date,y=Sales,colour = Type)) + geom_line()
#filter only 2 model LM&RF
plot_data2 <- plot_data %>% filter(Type != 'tree_pred',
Type != 'nn_pred')
plot_data2 %>% ggplot(aes(x=Date,y=Sales,colour = Type,alpha = Type)) + geom_line(size = 1.5) +
scale_alpha_manual(values = c(0.45,0.6,1))