LOAD DATA

library(readxl)
data = read_excel("C:/Users/LENOVO/Downloads/FOLDER NADIA/SEMESTER 6/PROYEK DATA SCIENCE/DRAFT PROYEK/Data.xlsx")
data
## # A tibble: 108 × 2
##    Bulan     `Jumlah Pengunjung`
##    <chr>                   <dbl>
##  1 Januari                 47237
##  2 Februari                18009
##  3 Maret                   17462
##  4 April                   17974
##  5 Mei                     24137
##  6 Juni                    28620
##  7 Juli                    66517
##  8 Agustus                 57905
##  9 September               15975
## 10 Oktober                 21374
## # ℹ 98 more rows

Exploratory Data Analysis (EDA)

#Mengecek dimensi data
dim(data)
## [1] 108   2
#Mengecek variabel data
names(data)
## [1] "Bulan"             "Jumlah Pengunjung"
#Mengecek tipe data tiap variabel
str(data)
## tibble [108 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Bulan            : chr [1:108] "Januari" "Februari" "Maret" "April" ...
##  $ Jumlah Pengunjung: num [1:108] 47237 18009 17462 17974 24137 ...
#Mengecek missing value
is.na(data)
##        Bulan Jumlah Pengunjung
##   [1,] FALSE             FALSE
##   [2,] FALSE             FALSE
##   [3,] FALSE             FALSE
##   [4,] FALSE             FALSE
##   [5,] FALSE             FALSE
##   [6,] FALSE             FALSE
##   [7,] FALSE             FALSE
##   [8,] FALSE             FALSE
##   [9,] FALSE             FALSE
##  [10,] FALSE             FALSE
##  [11,] FALSE             FALSE
##  [12,] FALSE             FALSE
##  [13,] FALSE             FALSE
##  [14,] FALSE             FALSE
##  [15,] FALSE             FALSE
##  [16,] FALSE             FALSE
##  [17,] FALSE             FALSE
##  [18,] FALSE             FALSE
##  [19,] FALSE             FALSE
##  [20,] FALSE             FALSE
##  [21,] FALSE             FALSE
##  [22,] FALSE             FALSE
##  [23,] FALSE             FALSE
##  [24,] FALSE             FALSE
##  [25,] FALSE             FALSE
##  [26,] FALSE             FALSE
##  [27,] FALSE             FALSE
##  [28,] FALSE             FALSE
##  [29,] FALSE             FALSE
##  [30,] FALSE             FALSE
##  [31,] FALSE             FALSE
##  [32,] FALSE             FALSE
##  [33,] FALSE             FALSE
##  [34,] FALSE             FALSE
##  [35,] FALSE             FALSE
##  [36,] FALSE             FALSE
##  [37,] FALSE             FALSE
##  [38,] FALSE             FALSE
##  [39,] FALSE             FALSE
##  [40,] FALSE             FALSE
##  [41,] FALSE             FALSE
##  [42,] FALSE             FALSE
##  [43,] FALSE             FALSE
##  [44,] FALSE             FALSE
##  [45,] FALSE             FALSE
##  [46,] FALSE             FALSE
##  [47,] FALSE             FALSE
##  [48,] FALSE             FALSE
##  [49,] FALSE             FALSE
##  [50,] FALSE             FALSE
##  [51,] FALSE             FALSE
##  [52,] FALSE             FALSE
##  [53,] FALSE             FALSE
##  [54,] FALSE             FALSE
##  [55,] FALSE             FALSE
##  [56,] FALSE             FALSE
##  [57,] FALSE             FALSE
##  [58,] FALSE             FALSE
##  [59,] FALSE             FALSE
##  [60,] FALSE             FALSE
##  [61,] FALSE             FALSE
##  [62,] FALSE             FALSE
##  [63,] FALSE             FALSE
##  [64,] FALSE             FALSE
##  [65,] FALSE             FALSE
##  [66,] FALSE             FALSE
##  [67,] FALSE             FALSE
##  [68,] FALSE             FALSE
##  [69,] FALSE             FALSE
##  [70,] FALSE             FALSE
##  [71,] FALSE             FALSE
##  [72,] FALSE             FALSE
##  [73,] FALSE             FALSE
##  [74,] FALSE             FALSE
##  [75,] FALSE             FALSE
##  [76,] FALSE             FALSE
##  [77,] FALSE             FALSE
##  [78,] FALSE             FALSE
##  [79,] FALSE             FALSE
##  [80,] FALSE             FALSE
##  [81,] FALSE             FALSE
##  [82,] FALSE             FALSE
##  [83,] FALSE             FALSE
##  [84,] FALSE             FALSE
##  [85,] FALSE             FALSE
##  [86,] FALSE             FALSE
##  [87,] FALSE             FALSE
##  [88,] FALSE             FALSE
##  [89,] FALSE             FALSE
##  [90,] FALSE             FALSE
##  [91,] FALSE             FALSE
##  [92,] FALSE             FALSE
##  [93,] FALSE             FALSE
##  [94,] FALSE             FALSE
##  [95,] FALSE             FALSE
##  [96,] FALSE             FALSE
##  [97,] FALSE             FALSE
##  [98,] FALSE             FALSE
##  [99,] FALSE             FALSE
## [100,] FALSE             FALSE
## [101,] FALSE             FALSE
## [102,] FALSE             FALSE
## [103,] FALSE             FALSE
## [104,] FALSE             FALSE
## [105,] FALSE             FALSE
## [106,] FALSE             FALSE
## [107,] FALSE             FALSE
## [108,] FALSE             FALSE
#Statistika deskriptif
library(pastecs)
stat.desc(data)
##          Bulan Jumlah Pengunjung
## nbr.val     NA      1.080000e+02
## nbr.null    NA      3.000000e+00
## nbr.na      NA      0.000000e+00
## min         NA      0.000000e+00
## max         NA      1.929000e+05
## range       NA      1.929000e+05
## sum         NA      4.260396e+06
## median      NA      3.056200e+04
## mean        NA      3.944811e+04
## SE.mean     NA      3.240505e+03
## CI.mean     NA      6.423924e+03
## var         NA      1.134094e+09
## std.dev     NA      3.367632e+04
## coef.var    NA      8.536864e-01
summary(data)
##     Bulan           Jumlah Pengunjung
##  Length:108         Min.   :     0   
##  Class :character   1st Qu.: 19474   
##  Mode  :character   Median : 30562   
##                     Mean   : 39448   
##                     3rd Qu.: 46386   
##                     Max.   :192900
var(data)
## Warning in var(data): NAs introduced by coercion
##                   Bulan Jumlah Pengunjung
## Bulan                NA                NA
## Jumlah Pengunjung    NA        1134094357
#Preprocessing data
#Untuk pemodelan hanya digunakan variabel jumlah pengunjung maka variabel bulan dihapus
df = subset(data, select = -c(Bulan))
df
## # A tibble: 108 × 1
##    `Jumlah Pengunjung`
##                  <dbl>
##  1               47237
##  2               18009
##  3               17462
##  4               17974
##  5               24137
##  6               28620
##  7               66517
##  8               57905
##  9               15975
## 10               21374
## # ℹ 98 more rows
#Mengubah dataset menjadi data time serues
datats <- ts(df)
datats
## Time Series:
## Start = 1 
## End = 108 
## Frequency = 1 
##        Jumlah Pengunjung
##   [1,]             47237
##   [2,]             18009
##   [3,]             17462
##   [4,]             17974
##   [5,]             24137
##   [6,]             28620
##   [7,]             66517
##   [8,]             57905
##   [9,]             15975
##  [10,]             21374
##  [11,]             22328
##  [12,]             46315
##  [13,]             50717
##  [14,]             22139
##  [15,]             22571
##  [16,]             24334
##  [17,]             39411
##  [18,]             24253
##  [19,]            118361
##  [20,]             34141
##  [21,]             24731
##  [22,]             25585
##  [23,]             21433
##  [24,]             53813
##  [25,]             71349
##  [26,]             27932
##  [27,]             27068
##  [28,]             24150
##  [29,]             52206
##  [30,]             13814
##  [31,]            140578
##  [32,]             26706
##  [33,]             30904
##  [34,]             30941
##  [35,]             26229
##  [36,]             66107
##  [37,]             91912
##  [38,]             30414
##  [39,]             33408
##  [40,]             45319
##  [41,]             41580
##  [42,]            106245
##  [43,]            100674
##  [44,]             30121
##  [45,]             40614
##  [46,]             27611
##  [47,]             20591
##  [48,]             64931
##  [49,]             71755
##  [50,]             34453
##  [51,]             39112
##  [52,]             48870
##  [53,]             37354
##  [54,]            176599
##  [55,]             77002
##  [56,]             37200
##  [57,]             45403
##  [58,]             33180
##  [59,]             36639
##  [60,]             78096
##  [61,]             64179
##  [62,]             35357
##  [63,]             41663
##  [64,]             51756
##  [65,]             16184
##  [66,]            192900
##  [67,]             83148
##  [68,]             33242
##  [69,]             39699
##  [70,]             40015
##  [71,]             40052
##  [72,]            108701
##  [73,]             60961
##  [74,]             30710
##  [75,]             17779
##  [76,]                 0
##  [77,]                 0
##  [78,]                 0
##  [79,]              2481
##  [80,]             29679
##  [81,]             14428
##  [82,]             17690
##  [83,]             15964
##  [84,]             10926
##  [85,]             12787
##  [86,]              5718
##  [87,]             13184
##  [88,]             10387
##  [89,]             46598
##  [90,]             24281
##  [91,]               336
##  [92,]               175
##  [93,]              5416
##  [94,]             16706
##  [95,]             15982
##  [96,]             27019
##  [97,]             39566
##  [98,]             21653
##  [99,]             22694
## [100,]              4233
## [101,]            115430
## [102,]             45257
## [103,]             41466
## [104,]             19804
## [105,]             19892
## [106,]             18482
## [107,]             15320
## [108,]             38087
str(datats)
##  Time-Series [1:108, 1] from 1 to 108: 47237 18009 17462 17974 24137 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "Jumlah Pengunjung"

Plot data time series

plot.ts(datats)

Plot dan hasil korelasi ACF

#Menampilkan plot korelasi ACF
acf(datats)

#Menampilkan hasil korelasi ACF
print(acf(datats))

## 
## Autocorrelations of series 'datats', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.223  0.084  0.029 -0.021  0.197  0.477  0.143 -0.012 -0.006 -0.076 
##     11     12     13     14     15     16     17     18     19     20 
##  0.158  0.535  0.043 -0.005 -0.081 -0.098  0.160  0.281 -0.047 -0.140

Plot dan hasil korelasi PACF

#Menampilkan plot korelasi PACF
pacf(datats)

#Menampilkan hasil korelasi PACF
print(pacf(datats))

## 
## Partial autocorrelations of series 'datats', by lag
## 
##      1      2      3      4      5      6      7      8      9     10     11 
##  0.223  0.036  0.003 -0.033  0.218  0.435 -0.049 -0.136  0.020 -0.040  0.060 
##     12     13     14     15     16     17     18     19     20 
##  0.404 -0.197 -0.083 -0.040  0.019  0.035 -0.093 -0.142 -0.109

Differencing

#Melakukan differencing karena plot time series sebelumnya terlihat bahwa data jumlah pengunjung di lokawisata baturaden belum stasioner dalam rata-rata maupun variansi atau plot sebelumnya tidak berfluktuasi disekitar titik nol/konstan
datadiff1 <- diff(datats, differences = 1)

#Menampilkan plot time series setelah di differencing
plot.ts(datadiff1)

Plot dan hasil korelasi ACF setelah differencing

#Menampilkan plot korelasi ACF
acf(datadiff1)

#Menampilkan hasil korelasi ACF
print(acf(datadiff1))

## 
## Autocorrelations of series 'datadiff1', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000 -0.409 -0.054 -0.003 -0.173 -0.040  0.395 -0.115 -0.103  0.049 -0.196 
##     11     12     13     14     15     16     17     18     19     20 
## -0.093  0.559 -0.284  0.017 -0.038 -0.177  0.089  0.285 -0.147 -0.079

Plot dan hasil korelasi PACF setelah differencing

#Menampilkan plot korelasi PACF
pacf(datadiff1)

#Menampilkan hasil korelasi PACF
print(pacf(datadiff1))

## 
## Partial autocorrelations of series 'datadiff1', by lag
## 
##      1      2      3      4      5      6      7      8      9     10     11 
## -0.409 -0.267 -0.182 -0.359 -0.490  0.011  0.089 -0.071 -0.010 -0.106 -0.427 
##     12     13     14     15     16     17     18     19     20 
##  0.182  0.059  0.010 -0.051 -0.064  0.065  0.100  0.060 -0.094

Pemodelan ARIMA

#Model ARIMA yang mungkin
fit1 = arima(datats, order = c(1, 1, 0))
fit2 = arima(datats, order = c(0, 1, 1))
fit3 = arima(datats, order = c(1, 1, 1))
fit4 = arima(datats, order = c(2, 1, 0))
fit5 = arima(datats, order = c(2, 1, 1))
fit6 = arima(datats, order = c(2, 1, 2))
fit7 = arima(datats, order = c(1, 1, 2))
fit8 = arima(datats, order = c(3, 1, 2))
fit9 = arima(datats, order = c(4, 1, 2))
fit10 = arima(datats, order = c(4, 1, 3))
fit11 = arima(datats, order = c(5, 1, 2))
fit12 = arima(datats, order = c(5, 1, 3))
fit13 = arima(datats, order = c(5, 1, 6))
fit14 = arima(datats, order = c(6, 1, 2))
fit15 = arima(datats, order = c(6, 1, 3))
fit16 = arima(datats, order = c(6, 1, 5))
## Warning in arima(datats, order = c(6, 1, 5)): possible convergence problem:
## optim gave code = 1

Nilai AIC tiap model

fit1$aic
## [1] 2566.029
fit2$aic
## [1] 2534.225
fit3$aic
## [1] 2535.298
fit4$aic
## [1] 2560.107
fit5$aic
## [1] 2536.807
fit6$aic
## [1] 2536.404
fit7$aic
## [1] 2537.102
fit8$aic
## [1] 2536.319
fit9$aic
## [1] 2518.564
fit10$aic
## [1] 2514.167
fit11$aic
## [1] 2521.309
fit12$aic
## [1] 2517.17
fit13$aic
## [1] 2504.249
fit14$aic
## [1] 2518.242
fit15$aic
## [1] 2522.818
fit16$aic
## [1] 2504.005

Nilai BIC tiap model

BIC(fit1)
## [1] 2571.374
BIC(fit2)
## [1] 2539.57
BIC(fit3)
## [1] 2543.317
BIC(fit4)
## [1] 2568.126
BIC(fit5)
## [1] 2547.499
BIC(fit6)
## [1] 2549.768
BIC(fit7)
## [1] 2547.793
BIC(fit8)
## [1] 2552.356
BIC(fit9)
## [1] 2537.274
BIC(fit10)
## [1] 2535.55
BIC(fit11)
## [1] 2542.692
BIC(fit12)
## [1] 2541.225
BIC(fit13)
## [1] 2536.323
BIC(fit14)
## [1] 2542.297
BIC(fit15)
## [1] 2549.546
BIC(fit16)
## [1] 2536.079

Verifikasi Model

#Pemodelan terbaik, yaitu pada fit16 karena nilai AIC = 2.504,005 dan nilai BIC = 2.536,079 dan karena dari plot histogram dan p-value > 0.05
tsdiag(fit16)

Peramalan

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
fit16
## 
## Call:
## arima(x = datats, order = c(6, 1, 5))
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5     ar6      ma1    ma2
##       -0.4096  -0.7823  -0.3757  -0.8494  -0.2165  0.1333  -0.3816  0.540
## s.e.   0.2092   0.2141   0.2010   0.1802   0.2124  0.1824   0.1837  0.084
##           ma3     ma4      ma5
##       -0.4156  0.6523  -0.7300
## s.e.   0.1125  0.1051   0.1227
## 
## sigma^2 estimated as 629065482:  log likelihood = -1240,  aic = 2504
fit16$var.coef
##              ar1          ar2          ar3          ar4          ar5
## ar1  0.043763159  0.038322295  0.041273108  0.031707835  0.043646026
## ar2  0.038322295  0.045851898  0.037658873  0.037657017  0.039687939
## ar3  0.041273108  0.037658873  0.040389799  0.031428789  0.041956811
## ar4  0.031707835  0.037657017  0.031428789  0.032456965  0.033275640
## ar5  0.043646026  0.039687939  0.041956811  0.033275640  0.045127257
## ar6  0.031278069  0.038331372  0.030976939  0.032082437  0.032713375
## ma1 -0.033430815 -0.032509555 -0.031508844 -0.026216022 -0.033483107
## ma2 -0.009980524 -0.012666299 -0.009497476 -0.010005227 -0.009822274
## ma3 -0.015607428 -0.014875770 -0.016116227 -0.012899224 -0.016324796
## ma4  0.008012125  0.007421267  0.006577253  0.004564824  0.006868174
## ma5 -0.017172535 -0.015974027 -0.016479799 -0.014295275 -0.018385903
##              ar6          ma1          ma2          ma3          ma4
## ar1  0.031278069 -0.033430815 -0.009980524 -0.015607428  0.008012125
## ar2  0.038331372 -0.032509555 -0.012666299 -0.014875770  0.007421267
## ar3  0.030976939 -0.031508844 -0.009497476 -0.016116227  0.006577253
## ar4  0.032082437 -0.026216022 -0.010005227 -0.012899224  0.004564824
## ar5  0.032713375 -0.033483107 -0.009822274 -0.016324796  0.006868174
## ar6  0.033269241 -0.027290038 -0.009853798 -0.012503664  0.006272624
## ma1 -0.027290038  0.033747847  0.009072081  0.013461513 -0.005552867
## ma2 -0.009853798  0.009072081  0.007051905  0.004110697 -0.001523462
## ma3 -0.012503664  0.013461513  0.004110697  0.012647204 -0.003000927
## ma4  0.006272624 -0.005552867 -0.001523462 -0.003000927  0.011038802
## ma5 -0.013719438  0.015058823  0.003645952  0.010965617 -0.003304247
##              ma5
## ar1 -0.017172535
## ar2 -0.015974027
## ar3 -0.016479799
## ar4 -0.014295275
## ar5 -0.018385903
## ar6 -0.013719438
## ma1  0.015058823
## ma2  0.003645952
## ma3  0.010965617
## ma4 -0.003304247
## ma5  0.015047382
mean(datadiff1)
## [1] -85.51402
mean(datats)
## [1] 39448.11
dt <- forecast(fit16)
plot(dt)

Box.test(dt$residuals)
## 
##  Box-Pierce test
## 
## data:  dt$residuals
## X-squared = 0.014337, df = 1, p-value = 0.9047
hist(dt$residuals)

checkresiduals(fit16)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(6,1,5)
## Q* = 9.8178, df = 3, p-value = 0.02018
## 
## Model df: 11.   Total lags used: 14
predict(fit16, n.ahead = 12)
## $pred
## Time Series:
## Start = 109 
## End = 120 
## Frequency = 1 
##  [1] 27252.89 18614.28 19152.11 30391.13 31222.43 34604.51 28316.12 17119.66
##  [9] 22287.15 29737.84 31569.23 34370.94
## 
## $se
## Time Series:
## Start = 109 
## End = 120 
## Frequency = 1 
##  [1] 25297.62 25788.78 26771.96 26880.62 27312.02 27313.49 28461.96 29660.08
##  [9] 29830.33 30214.68 30347.92 30399.88