library(readxl)
data = read_excel("C:/Users/LENOVO/Downloads/FOLDER NADIA/SEMESTER 6/PROYEK DATA SCIENCE/DRAFT PROYEK/Data.xlsx")
data
## # A tibble: 108 × 2
## Bulan `Jumlah Pengunjung`
## <chr> <dbl>
## 1 Januari 47237
## 2 Februari 18009
## 3 Maret 17462
## 4 April 17974
## 5 Mei 24137
## 6 Juni 28620
## 7 Juli 66517
## 8 Agustus 57905
## 9 September 15975
## 10 Oktober 21374
## # ℹ 98 more rows
#Mengecek dimensi data
dim(data)
## [1] 108 2
#Mengecek variabel data
names(data)
## [1] "Bulan" "Jumlah Pengunjung"
#Mengecek tipe data tiap variabel
str(data)
## tibble [108 × 2] (S3: tbl_df/tbl/data.frame)
## $ Bulan : chr [1:108] "Januari" "Februari" "Maret" "April" ...
## $ Jumlah Pengunjung: num [1:108] 47237 18009 17462 17974 24137 ...
#Mengecek missing value
is.na(data)
## Bulan Jumlah Pengunjung
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,] FALSE FALSE
## [4,] FALSE FALSE
## [5,] FALSE FALSE
## [6,] FALSE FALSE
## [7,] FALSE FALSE
## [8,] FALSE FALSE
## [9,] FALSE FALSE
## [10,] FALSE FALSE
## [11,] FALSE FALSE
## [12,] FALSE FALSE
## [13,] FALSE FALSE
## [14,] FALSE FALSE
## [15,] FALSE FALSE
## [16,] FALSE FALSE
## [17,] FALSE FALSE
## [18,] FALSE FALSE
## [19,] FALSE FALSE
## [20,] FALSE FALSE
## [21,] FALSE FALSE
## [22,] FALSE FALSE
## [23,] FALSE FALSE
## [24,] FALSE FALSE
## [25,] FALSE FALSE
## [26,] FALSE FALSE
## [27,] FALSE FALSE
## [28,] FALSE FALSE
## [29,] FALSE FALSE
## [30,] FALSE FALSE
## [31,] FALSE FALSE
## [32,] FALSE FALSE
## [33,] FALSE FALSE
## [34,] FALSE FALSE
## [35,] FALSE FALSE
## [36,] FALSE FALSE
## [37,] FALSE FALSE
## [38,] FALSE FALSE
## [39,] FALSE FALSE
## [40,] FALSE FALSE
## [41,] FALSE FALSE
## [42,] FALSE FALSE
## [43,] FALSE FALSE
## [44,] FALSE FALSE
## [45,] FALSE FALSE
## [46,] FALSE FALSE
## [47,] FALSE FALSE
## [48,] FALSE FALSE
## [49,] FALSE FALSE
## [50,] FALSE FALSE
## [51,] FALSE FALSE
## [52,] FALSE FALSE
## [53,] FALSE FALSE
## [54,] FALSE FALSE
## [55,] FALSE FALSE
## [56,] FALSE FALSE
## [57,] FALSE FALSE
## [58,] FALSE FALSE
## [59,] FALSE FALSE
## [60,] FALSE FALSE
## [61,] FALSE FALSE
## [62,] FALSE FALSE
## [63,] FALSE FALSE
## [64,] FALSE FALSE
## [65,] FALSE FALSE
## [66,] FALSE FALSE
## [67,] FALSE FALSE
## [68,] FALSE FALSE
## [69,] FALSE FALSE
## [70,] FALSE FALSE
## [71,] FALSE FALSE
## [72,] FALSE FALSE
## [73,] FALSE FALSE
## [74,] FALSE FALSE
## [75,] FALSE FALSE
## [76,] FALSE FALSE
## [77,] FALSE FALSE
## [78,] FALSE FALSE
## [79,] FALSE FALSE
## [80,] FALSE FALSE
## [81,] FALSE FALSE
## [82,] FALSE FALSE
## [83,] FALSE FALSE
## [84,] FALSE FALSE
## [85,] FALSE FALSE
## [86,] FALSE FALSE
## [87,] FALSE FALSE
## [88,] FALSE FALSE
## [89,] FALSE FALSE
## [90,] FALSE FALSE
## [91,] FALSE FALSE
## [92,] FALSE FALSE
## [93,] FALSE FALSE
## [94,] FALSE FALSE
## [95,] FALSE FALSE
## [96,] FALSE FALSE
## [97,] FALSE FALSE
## [98,] FALSE FALSE
## [99,] FALSE FALSE
## [100,] FALSE FALSE
## [101,] FALSE FALSE
## [102,] FALSE FALSE
## [103,] FALSE FALSE
## [104,] FALSE FALSE
## [105,] FALSE FALSE
## [106,] FALSE FALSE
## [107,] FALSE FALSE
## [108,] FALSE FALSE
#Statistika deskriptif
library(pastecs)
stat.desc(data)
## Bulan Jumlah Pengunjung
## nbr.val NA 1.080000e+02
## nbr.null NA 3.000000e+00
## nbr.na NA 0.000000e+00
## min NA 0.000000e+00
## max NA 1.929000e+05
## range NA 1.929000e+05
## sum NA 4.260396e+06
## median NA 3.056200e+04
## mean NA 3.944811e+04
## SE.mean NA 3.240505e+03
## CI.mean NA 6.423924e+03
## var NA 1.134094e+09
## std.dev NA 3.367632e+04
## coef.var NA 8.536864e-01
summary(data)
## Bulan Jumlah Pengunjung
## Length:108 Min. : 0
## Class :character 1st Qu.: 19474
## Mode :character Median : 30562
## Mean : 39448
## 3rd Qu.: 46386
## Max. :192900
var(data)
## Warning in var(data): NAs introduced by coercion
## Bulan Jumlah Pengunjung
## Bulan NA NA
## Jumlah Pengunjung NA 1134094357
#Preprocessing data
#Untuk pemodelan hanya digunakan variabel jumlah pengunjung maka variabel bulan dihapus
df = subset(data, select = -c(Bulan))
df
## # A tibble: 108 × 1
## `Jumlah Pengunjung`
## <dbl>
## 1 47237
## 2 18009
## 3 17462
## 4 17974
## 5 24137
## 6 28620
## 7 66517
## 8 57905
## 9 15975
## 10 21374
## # ℹ 98 more rows
#Mengubah dataset menjadi data time serues
datats <- ts(df)
datats
## Time Series:
## Start = 1
## End = 108
## Frequency = 1
## Jumlah Pengunjung
## [1,] 47237
## [2,] 18009
## [3,] 17462
## [4,] 17974
## [5,] 24137
## [6,] 28620
## [7,] 66517
## [8,] 57905
## [9,] 15975
## [10,] 21374
## [11,] 22328
## [12,] 46315
## [13,] 50717
## [14,] 22139
## [15,] 22571
## [16,] 24334
## [17,] 39411
## [18,] 24253
## [19,] 118361
## [20,] 34141
## [21,] 24731
## [22,] 25585
## [23,] 21433
## [24,] 53813
## [25,] 71349
## [26,] 27932
## [27,] 27068
## [28,] 24150
## [29,] 52206
## [30,] 13814
## [31,] 140578
## [32,] 26706
## [33,] 30904
## [34,] 30941
## [35,] 26229
## [36,] 66107
## [37,] 91912
## [38,] 30414
## [39,] 33408
## [40,] 45319
## [41,] 41580
## [42,] 106245
## [43,] 100674
## [44,] 30121
## [45,] 40614
## [46,] 27611
## [47,] 20591
## [48,] 64931
## [49,] 71755
## [50,] 34453
## [51,] 39112
## [52,] 48870
## [53,] 37354
## [54,] 176599
## [55,] 77002
## [56,] 37200
## [57,] 45403
## [58,] 33180
## [59,] 36639
## [60,] 78096
## [61,] 64179
## [62,] 35357
## [63,] 41663
## [64,] 51756
## [65,] 16184
## [66,] 192900
## [67,] 83148
## [68,] 33242
## [69,] 39699
## [70,] 40015
## [71,] 40052
## [72,] 108701
## [73,] 60961
## [74,] 30710
## [75,] 17779
## [76,] 0
## [77,] 0
## [78,] 0
## [79,] 2481
## [80,] 29679
## [81,] 14428
## [82,] 17690
## [83,] 15964
## [84,] 10926
## [85,] 12787
## [86,] 5718
## [87,] 13184
## [88,] 10387
## [89,] 46598
## [90,] 24281
## [91,] 336
## [92,] 175
## [93,] 5416
## [94,] 16706
## [95,] 15982
## [96,] 27019
## [97,] 39566
## [98,] 21653
## [99,] 22694
## [100,] 4233
## [101,] 115430
## [102,] 45257
## [103,] 41466
## [104,] 19804
## [105,] 19892
## [106,] 18482
## [107,] 15320
## [108,] 38087
str(datats)
## Time-Series [1:108, 1] from 1 to 108: 47237 18009 17462 17974 24137 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "Jumlah Pengunjung"
plot.ts(datats)
#Menampilkan plot korelasi ACF
acf(datats)
#Menampilkan hasil korelasi ACF
print(acf(datats))
##
## Autocorrelations of series 'datats', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 0.223 0.084 0.029 -0.021 0.197 0.477 0.143 -0.012 -0.006 -0.076
## 11 12 13 14 15 16 17 18 19 20
## 0.158 0.535 0.043 -0.005 -0.081 -0.098 0.160 0.281 -0.047 -0.140
#Menampilkan plot korelasi PACF
pacf(datats)
#Menampilkan hasil korelasi PACF
print(pacf(datats))
##
## Partial autocorrelations of series 'datats', by lag
##
## 1 2 3 4 5 6 7 8 9 10 11
## 0.223 0.036 0.003 -0.033 0.218 0.435 -0.049 -0.136 0.020 -0.040 0.060
## 12 13 14 15 16 17 18 19 20
## 0.404 -0.197 -0.083 -0.040 0.019 0.035 -0.093 -0.142 -0.109
#Melakukan differencing karena plot time series sebelumnya terlihat bahwa data jumlah pengunjung di lokawisata baturaden belum stasioner dalam rata-rata maupun variansi atau plot sebelumnya tidak berfluktuasi disekitar titik nol/konstan
datadiff1 <- diff(datats, differences = 1)
#Menampilkan plot time series setelah di differencing
plot.ts(datadiff1)
#Menampilkan plot korelasi ACF
acf(datadiff1)
#Menampilkan hasil korelasi ACF
print(acf(datadiff1))
##
## Autocorrelations of series 'datadiff1', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 -0.409 -0.054 -0.003 -0.173 -0.040 0.395 -0.115 -0.103 0.049 -0.196
## 11 12 13 14 15 16 17 18 19 20
## -0.093 0.559 -0.284 0.017 -0.038 -0.177 0.089 0.285 -0.147 -0.079
#Menampilkan plot korelasi PACF
pacf(datadiff1)
#Menampilkan hasil korelasi PACF
print(pacf(datadiff1))
##
## Partial autocorrelations of series 'datadiff1', by lag
##
## 1 2 3 4 5 6 7 8 9 10 11
## -0.409 -0.267 -0.182 -0.359 -0.490 0.011 0.089 -0.071 -0.010 -0.106 -0.427
## 12 13 14 15 16 17 18 19 20
## 0.182 0.059 0.010 -0.051 -0.064 0.065 0.100 0.060 -0.094
#Model ARIMA yang mungkin
fit1 = arima(datats, order = c(1, 1, 0))
fit2 = arima(datats, order = c(0, 1, 1))
fit3 = arima(datats, order = c(1, 1, 1))
fit4 = arima(datats, order = c(2, 1, 0))
fit5 = arima(datats, order = c(2, 1, 1))
fit6 = arima(datats, order = c(2, 1, 2))
fit7 = arima(datats, order = c(1, 1, 2))
fit8 = arima(datats, order = c(3, 1, 2))
fit9 = arima(datats, order = c(4, 1, 2))
fit10 = arima(datats, order = c(4, 1, 3))
fit11 = arima(datats, order = c(5, 1, 2))
fit12 = arima(datats, order = c(5, 1, 3))
fit13 = arima(datats, order = c(5, 1, 6))
fit14 = arima(datats, order = c(6, 1, 2))
fit15 = arima(datats, order = c(6, 1, 3))
fit16 = arima(datats, order = c(6, 1, 5))
## Warning in arima(datats, order = c(6, 1, 5)): possible convergence problem:
## optim gave code = 1
fit1$aic
## [1] 2566.029
fit2$aic
## [1] 2534.225
fit3$aic
## [1] 2535.298
fit4$aic
## [1] 2560.107
fit5$aic
## [1] 2536.807
fit6$aic
## [1] 2536.404
fit7$aic
## [1] 2537.102
fit8$aic
## [1] 2536.319
fit9$aic
## [1] 2518.564
fit10$aic
## [1] 2514.167
fit11$aic
## [1] 2521.309
fit12$aic
## [1] 2517.17
fit13$aic
## [1] 2504.249
fit14$aic
## [1] 2518.242
fit15$aic
## [1] 2522.818
fit16$aic
## [1] 2504.005
BIC(fit1)
## [1] 2571.374
BIC(fit2)
## [1] 2539.57
BIC(fit3)
## [1] 2543.317
BIC(fit4)
## [1] 2568.126
BIC(fit5)
## [1] 2547.499
BIC(fit6)
## [1] 2549.768
BIC(fit7)
## [1] 2547.793
BIC(fit8)
## [1] 2552.356
BIC(fit9)
## [1] 2537.274
BIC(fit10)
## [1] 2535.55
BIC(fit11)
## [1] 2542.692
BIC(fit12)
## [1] 2541.225
BIC(fit13)
## [1] 2536.323
BIC(fit14)
## [1] 2542.297
BIC(fit15)
## [1] 2549.546
BIC(fit16)
## [1] 2536.079
#Pemodelan terbaik, yaitu pada fit16 karena nilai AIC = 2.504,005 dan nilai BIC = 2.536,079 dan karena dari plot histogram dan p-value > 0.05
tsdiag(fit16)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
fit16
##
## Call:
## arima(x = datats, order = c(6, 1, 5))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ma1 ma2
## -0.4096 -0.7823 -0.3757 -0.8494 -0.2165 0.1333 -0.3816 0.540
## s.e. 0.2092 0.2141 0.2010 0.1802 0.2124 0.1824 0.1837 0.084
## ma3 ma4 ma5
## -0.4156 0.6523 -0.7300
## s.e. 0.1125 0.1051 0.1227
##
## sigma^2 estimated as 629065482: log likelihood = -1240, aic = 2504
fit16$var.coef
## ar1 ar2 ar3 ar4 ar5
## ar1 0.043763159 0.038322295 0.041273108 0.031707835 0.043646026
## ar2 0.038322295 0.045851898 0.037658873 0.037657017 0.039687939
## ar3 0.041273108 0.037658873 0.040389799 0.031428789 0.041956811
## ar4 0.031707835 0.037657017 0.031428789 0.032456965 0.033275640
## ar5 0.043646026 0.039687939 0.041956811 0.033275640 0.045127257
## ar6 0.031278069 0.038331372 0.030976939 0.032082437 0.032713375
## ma1 -0.033430815 -0.032509555 -0.031508844 -0.026216022 -0.033483107
## ma2 -0.009980524 -0.012666299 -0.009497476 -0.010005227 -0.009822274
## ma3 -0.015607428 -0.014875770 -0.016116227 -0.012899224 -0.016324796
## ma4 0.008012125 0.007421267 0.006577253 0.004564824 0.006868174
## ma5 -0.017172535 -0.015974027 -0.016479799 -0.014295275 -0.018385903
## ar6 ma1 ma2 ma3 ma4
## ar1 0.031278069 -0.033430815 -0.009980524 -0.015607428 0.008012125
## ar2 0.038331372 -0.032509555 -0.012666299 -0.014875770 0.007421267
## ar3 0.030976939 -0.031508844 -0.009497476 -0.016116227 0.006577253
## ar4 0.032082437 -0.026216022 -0.010005227 -0.012899224 0.004564824
## ar5 0.032713375 -0.033483107 -0.009822274 -0.016324796 0.006868174
## ar6 0.033269241 -0.027290038 -0.009853798 -0.012503664 0.006272624
## ma1 -0.027290038 0.033747847 0.009072081 0.013461513 -0.005552867
## ma2 -0.009853798 0.009072081 0.007051905 0.004110697 -0.001523462
## ma3 -0.012503664 0.013461513 0.004110697 0.012647204 -0.003000927
## ma4 0.006272624 -0.005552867 -0.001523462 -0.003000927 0.011038802
## ma5 -0.013719438 0.015058823 0.003645952 0.010965617 -0.003304247
## ma5
## ar1 -0.017172535
## ar2 -0.015974027
## ar3 -0.016479799
## ar4 -0.014295275
## ar5 -0.018385903
## ar6 -0.013719438
## ma1 0.015058823
## ma2 0.003645952
## ma3 0.010965617
## ma4 -0.003304247
## ma5 0.015047382
mean(datadiff1)
## [1] -85.51402
mean(datats)
## [1] 39448.11
dt <- forecast(fit16)
plot(dt)
Box.test(dt$residuals)
##
## Box-Pierce test
##
## data: dt$residuals
## X-squared = 0.014337, df = 1, p-value = 0.9047
hist(dt$residuals)
checkresiduals(fit16)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(6,1,5)
## Q* = 9.8178, df = 3, p-value = 0.02018
##
## Model df: 11. Total lags used: 14
predict(fit16, n.ahead = 12)
## $pred
## Time Series:
## Start = 109
## End = 120
## Frequency = 1
## [1] 27252.89 18614.28 19152.11 30391.13 31222.43 34604.51 28316.12 17119.66
## [9] 22287.15 29737.84 31569.23 34370.94
##
## $se
## Time Series:
## Start = 109
## End = 120
## Frequency = 1
## [1] 25297.62 25788.78 26771.96 26880.62 27312.02 27313.49 28461.96 29660.08
## [9] 29830.33 30214.68 30347.92 30399.88