LOAD DATA

library(readxl)
data = read_excel("C:/Users/LENOVO/Downloads/FOLDER NADIA/SEMESTER 6/PROYEK DATA SCIENCE/DRAFT PROYEK/Data.xlsx")
data

## # A tibble: 108 × 2
##    Bulan     `Jumlah Pengunjung`
##    <chr>                   <dbl>
##  1 Januari                 47237
##  2 Februari                18009
##  3 Maret                   17462
##  4 April                   17974
##  5 Mei                     24137
##  6 Juni                    28620
##  7 Juli                    66517
##  8 Agustus                 57905
##  9 September               15975
## 10 Oktober                 21374
## # ℹ 98 more rows

Exploratory Data Analysis (EDA)

#Mengecek dimensi data
dim(data)

## [1] 108   2

#Mengecek variabel data
names(data)

## [1] "Bulan"             "Jumlah Pengunjung"

#Mengecek tipe data tiap variabel
str(data)

## tibble [108 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Bulan            : chr [1:108] "Januari" "Februari" "Maret" "April" ...
##  $ Jumlah Pengunjung: num [1:108] 47237 18009 17462 17974 24137 ...

#Mengecek missing value
is.na(data)

##        Bulan Jumlah Pengunjung
##   [1,] FALSE             FALSE
##   [2,] FALSE             FALSE
##   [3,] FALSE             FALSE
##   [4,] FALSE             FALSE
##   [5,] FALSE             FALSE
##   [6,] FALSE             FALSE
##   [7,] FALSE             FALSE
##   [8,] FALSE             FALSE
##   [9,] FALSE             FALSE
##  [10,] FALSE             FALSE
##  [11,] FALSE             FALSE
##  [12,] FALSE             FALSE
##  [13,] FALSE             FALSE
##  [14,] FALSE             FALSE
##  [15,] FALSE             FALSE
##  [16,] FALSE             FALSE
##  [17,] FALSE             FALSE
##  [18,] FALSE             FALSE
##  [19,] FALSE             FALSE
##  [20,] FALSE             FALSE
##  [21,] FALSE             FALSE
##  [22,] FALSE             FALSE
##  [23,] FALSE             FALSE
##  [24,] FALSE             FALSE
##  [25,] FALSE             FALSE
##  [26,] FALSE             FALSE
##  [27,] FALSE             FALSE
##  [28,] FALSE             FALSE
##  [29,] FALSE             FALSE
##  [30,] FALSE             FALSE
##  [31,] FALSE             FALSE
##  [32,] FALSE             FALSE
##  [33,] FALSE             FALSE
##  [34,] FALSE             FALSE
##  [35,] FALSE             FALSE
##  [36,] FALSE             FALSE
##  [37,] FALSE             FALSE
##  [38,] FALSE             FALSE
##  [39,] FALSE             FALSE
##  [40,] FALSE             FALSE
##  [41,] FALSE             FALSE
##  [42,] FALSE             FALSE
##  [43,] FALSE             FALSE
##  [44,] FALSE             FALSE
##  [45,] FALSE             FALSE
##  [46,] FALSE             FALSE
##  [47,] FALSE             FALSE
##  [48,] FALSE             FALSE
##  [49,] FALSE             FALSE
##  [50,] FALSE             FALSE
##  [51,] FALSE             FALSE
##  [52,] FALSE             FALSE
##  [53,] FALSE             FALSE
##  [54,] FALSE             FALSE
##  [55,] FALSE             FALSE
##  [56,] FALSE             FALSE
##  [57,] FALSE             FALSE
##  [58,] FALSE             FALSE
##  [59,] FALSE             FALSE
##  [60,] FALSE             FALSE
##  [61,] FALSE             FALSE
##  [62,] FALSE             FALSE
##  [63,] FALSE             FALSE
##  [64,] FALSE             FALSE
##  [65,] FALSE             FALSE
##  [66,] FALSE             FALSE
##  [67,] FALSE             FALSE
##  [68,] FALSE             FALSE
##  [69,] FALSE             FALSE
##  [70,] FALSE             FALSE
##  [71,] FALSE             FALSE
##  [72,] FALSE             FALSE
##  [73,] FALSE             FALSE
##  [74,] FALSE             FALSE
##  [75,] FALSE             FALSE
##  [76,] FALSE             FALSE
##  [77,] FALSE             FALSE
##  [78,] FALSE             FALSE
##  [79,] FALSE             FALSE
##  [80,] FALSE             FALSE
##  [81,] FALSE             FALSE
##  [82,] FALSE             FALSE
##  [83,] FALSE             FALSE
##  [84,] FALSE             FALSE
##  [85,] FALSE             FALSE
##  [86,] FALSE             FALSE
##  [87,] FALSE             FALSE
##  [88,] FALSE             FALSE
##  [89,] FALSE             FALSE
##  [90,] FALSE             FALSE
##  [91,] FALSE             FALSE
##  [92,] FALSE             FALSE
##  [93,] FALSE             FALSE
##  [94,] FALSE             FALSE
##  [95,] FALSE             FALSE
##  [96,] FALSE             FALSE
##  [97,] FALSE             FALSE
##  [98,] FALSE             FALSE
##  [99,] FALSE             FALSE
## [100,] FALSE             FALSE
## [101,] FALSE             FALSE
## [102,] FALSE             FALSE
## [103,] FALSE             FALSE
## [104,] FALSE             FALSE
## [105,] FALSE             FALSE
## [106,] FALSE             FALSE
## [107,] FALSE             FALSE
## [108,] FALSE             FALSE

#Statistika deskriptif
library(pastecs)
stat.desc(data)

##          Bulan Jumlah Pengunjung
## nbr.val     NA      1.080000e+02
## nbr.null    NA      3.000000e+00
## nbr.na      NA      0.000000e+00
## min         NA      0.000000e+00
## max         NA      1.929000e+05
## range       NA      1.929000e+05
## sum         NA      4.260396e+06
## median      NA      3.056200e+04
## mean        NA      3.944811e+04
## SE.mean     NA      3.240505e+03
## CI.mean     NA      6.423924e+03
## var         NA      1.134094e+09
## std.dev     NA      3.367632e+04
## coef.var    NA      8.536864e-01

summary(data)

##     Bulan           Jumlah Pengunjung
##  Length:108         Min.   :     0   
##  Class :character   1st Qu.: 19474   
##  Mode  :character   Median : 30562   
##                     Mean   : 39448   
##                     3rd Qu.: 46386   
##                     Max.   :192900

var(data)

## Warning in var(data): NAs introduced by coercion

##                   Bulan Jumlah Pengunjung
## Bulan                NA                NA
## Jumlah Pengunjung    NA        1134094357

#Preprocessing data
#Untuk pemodelan hanya digunakan variabel jumlah pengunjung maka variabel bulan dihapus
df = subset(data, select = -c(Bulan))
df

## # A tibble: 108 × 1
##    `Jumlah Pengunjung`
##                  <dbl>
##  1               47237
##  2               18009
##  3               17462
##  4               17974
##  5               24137
##  6               28620
##  7               66517
##  8               57905
##  9               15975
## 10               21374
## # ℹ 98 more rows

#Mengubah dataset menjadi data time serues
datats <- ts(df)
datats

## Time Series:
## Start = 1 
## End = 108 
## Frequency = 1 
##        Jumlah Pengunjung
##   [1,]             47237
##   [2,]             18009
##   [3,]             17462
##   [4,]             17974
##   [5,]             24137
##   [6,]             28620
##   [7,]             66517
##   [8,]             57905
##   [9,]             15975
##  [10,]             21374
##  [11,]             22328
##  [12,]             46315
##  [13,]             50717
##  [14,]             22139
##  [15,]             22571
##  [16,]             24334
##  [17,]             39411
##  [18,]             24253
##  [19,]            118361
##  [20,]             34141
##  [21,]             24731
##  [22,]             25585
##  [23,]             21433
##  [24,]             53813
##  [25,]             71349
##  [26,]             27932
##  [27,]             27068
##  [28,]             24150
##  [29,]             52206
##  [30,]             13814
##  [31,]            140578
##  [32,]             26706
##  [33,]             30904
##  [34,]             30941
##  [35,]             26229
##  [36,]             66107
##  [37,]             91912
##  [38,]             30414
##  [39,]             33408
##  [40,]             45319
##  [41,]             41580
##  [42,]            106245
##  [43,]            100674
##  [44,]             30121
##  [45,]             40614
##  [46,]             27611
##  [47,]             20591
##  [48,]             64931
##  [49,]             71755
##  [50,]             34453
##  [51,]             39112
##  [52,]             48870
##  [53,]             37354
##  [54,]            176599
##  [55,]             77002
##  [56,]             37200
##  [57,]             45403
##  [58,]             33180
##  [59,]             36639
##  [60,]             78096
##  [61,]             64179
##  [62,]             35357
##  [63,]             41663
##  [64,]             51756
##  [65,]             16184
##  [66,]            192900
##  [67,]             83148
##  [68,]             33242
##  [69,]             39699
##  [70,]             40015
##  [71,]             40052
##  [72,]            108701
##  [73,]             60961
##  [74,]             30710
##  [75,]             17779
##  [76,]                 0
##  [77,]                 0
##  [78,]                 0
##  [79,]              2481
##  [80,]             29679
##  [81,]             14428
##  [82,]             17690
##  [83,]             15964
##  [84,]             10926
##  [85,]             12787
##  [86,]              5718
##  [87,]             13184
##  [88,]             10387
##  [89,]             46598
##  [90,]             24281
##  [91,]               336
##  [92,]               175
##  [93,]              5416
##  [94,]             16706
##  [95,]             15982
##  [96,]             27019
##  [97,]             39566
##  [98,]             21653
##  [99,]             22694
## [100,]              4233
## [101,]            115430
## [102,]             45257
## [103,]             41466
## [104,]             19804
## [105,]             19892
## [106,]             18482
## [107,]             15320
## [108,]             38087

str(datats)

##  Time-Series [1:108, 1] from 1 to 108: 47237 18009 17462 17974 24137 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "Jumlah Pengunjung"

Plot data time series

plot.ts(datats)

Plot dan hasil korelasi ACF

#Menampilkan plot korelasi ACF
acf(datats)

#Menampilkan hasil korelasi ACF
print(acf(datats))

## 
## Autocorrelations of series 'datats', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.223  0.084  0.029 -0.021  0.197  0.477  0.143 -0.012 -0.006 -0.076 
##     11     12     13     14     15     16     17     18     19     20 
##  0.158  0.535  0.043 -0.005 -0.081 -0.098  0.160  0.281 -0.047 -0.140

Plot dan hasil korelasi PACF

#Menampilkan plot korelasi PACF
pacf(datats)

#Menampilkan hasil korelasi PACF
print(pacf(datats))

## 
## Partial autocorrelations of series 'datats', by lag
## 
##      1      2      3      4      5      6      7      8      9     10     11 
##  0.223  0.036  0.003 -0.033  0.218  0.435 -0.049 -0.136  0.020 -0.040  0.060 
##     12     13     14     15     16     17     18     19     20 
##  0.404 -0.197 -0.083 -0.040  0.019  0.035 -0.093 -0.142 -0.109

Differencing

#Melakukan differencing karena plot time series sebelumnya terlihat bahwa data jumlah pengunjung di lokawisata baturaden belum stasioner dalam rata-rata maupun variansi atau plot sebelumnya tidak berfluktuasi disekitar titik nol/konstan
datadiff1 <- diff(datats, differences = 1)

#Menampilkan plot time series setelah di differencing
plot.ts(datadiff1)

Plot dan hasil korelasi ACF setelah differencing

#Menampilkan plot korelasi ACF
acf(datadiff1)

#Menampilkan hasil korelasi ACF
print(acf(datadiff1))

## 
## Autocorrelations of series 'datadiff1', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000 -0.409 -0.054 -0.003 -0.173 -0.040  0.395 -0.115 -0.103  0.049 -0.196 
##     11     12     13     14     15     16     17     18     19     20 
## -0.093  0.559 -0.284  0.017 -0.038 -0.177  0.089  0.285 -0.147 -0.079

Plot dan hasil korelasi PACF setelah differencing

#Menampilkan plot korelasi PACF
pacf(datadiff1)

#Menampilkan hasil korelasi PACF
print(pacf(datadiff1))

## 
## Partial autocorrelations of series 'datadiff1', by lag
## 
##      1      2      3      4      5      6      7      8      9     10     11 
## -0.409 -0.267 -0.182 -0.359 -0.490  0.011  0.089 -0.071 -0.010 -0.106 -0.427 
##     12     13     14     15     16     17     18     19     20 
##  0.182  0.059  0.010 -0.051 -0.064  0.065  0.100  0.060 -0.094

Pemodelan ARIMA

#Model ARIMA yang mungkin
fit1 = arima(datats, order = c(1, 1, 0))
fit2 = arima(datats, order = c(0, 1, 1))
fit3 = arima(datats, order = c(1, 1, 1))
fit4 = arima(datats, order = c(2, 1, 0))
fit5 = arima(datats, order = c(2, 1, 1))
fit6 = arima(datats, order = c(2, 1, 2))
fit7 = arima(datats, order = c(1, 1, 2))
fit8 = arima(datats, order = c(3, 1, 2))
fit9 = arima(datats, order = c(4, 1, 2))
fit10 = arima(datats, order = c(4, 1, 3))
fit11 = arima(datats, order = c(5, 1, 2))
fit12 = arima(datats, order = c(5, 1, 3))
fit13 = arima(datats, order = c(5, 1, 6))
fit14 = arima(datats, order = c(6, 1, 2))
fit15 = arima(datats, order = c(6, 1, 3))
fit16 = arima(datats, order = c(6, 1, 5))

## Warning in arima(datats, order = c(6, 1, 5)): possible convergence problem:
## optim gave code = 1

Nilai AIC tiap model

fit1$aic

## [1] 2566.029

fit2$aic

## [1] 2534.225

fit3$aic

## [1] 2535.298

fit4$aic

## [1] 2560.107

fit5$aic

## [1] 2536.807

fit6$aic

## [1] 2536.404

fit7$aic

## [1] 2537.102

fit8$aic

## [1] 2536.319

fit9$aic

## [1] 2518.564

fit10$aic

## [1] 2514.167

fit11$aic

## [1] 2521.309

fit12$aic

## [1] 2517.17

fit13$aic

## [1] 2504.249

fit14$aic

## [1] 2518.242

fit15$aic

## [1] 2522.818

fit16$aic

## [1] 2504.005

Nilai BIC tiap model

BIC(fit1)

## [1] 2571.374

BIC(fit2)

## [1] 2539.57

BIC(fit3)

## [1] 2543.317

BIC(fit4)

## [1] 2568.126

BIC(fit5)

## [1] 2547.499

BIC(fit6)

## [1] 2549.768

BIC(fit7)

## [1] 2547.793

BIC(fit8)

## [1] 2552.356

BIC(fit9)

## [1] 2537.274

BIC(fit10)

## [1] 2535.55

BIC(fit11)

## [1] 2542.692

BIC(fit12)

## [1] 2541.225

BIC(fit13)

## [1] 2536.323

BIC(fit14)

## [1] 2542.297

BIC(fit15)

## [1] 2549.546

BIC(fit16)

## [1] 2536.079

Verifikasi Model

#Pemodelan terbaik, yaitu pada fit16 karena nilai AIC = 2.504,005 dan nilai BIC = 2.536,079 dan karena dari plot histogram dan p-value > 0.05
tsdiag(fit16)

Peramalan

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

fit16

## 
## Call:
## arima(x = datats, order = c(6, 1, 5))
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5     ar6      ma1    ma2
##       -0.4096  -0.7823  -0.3757  -0.8494  -0.2165  0.1333  -0.3816  0.540
## s.e.   0.2092   0.2141   0.2010   0.1802   0.2124  0.1824   0.1837  0.084
##           ma3     ma4      ma5
##       -0.4156  0.6523  -0.7300
## s.e.   0.1125  0.1051   0.1227
## 
## sigma^2 estimated as 629065482:  log likelihood = -1240,  aic = 2504

fit16$var.coef

##              ar1          ar2          ar3          ar4          ar5
## ar1  0.043763159  0.038322295  0.041273108  0.031707835  0.043646026
## ar2  0.038322295  0.045851898  0.037658873  0.037657017  0.039687939
## ar3  0.041273108  0.037658873  0.040389799  0.031428789  0.041956811
## ar4  0.031707835  0.037657017  0.031428789  0.032456965  0.033275640
## ar5  0.043646026  0.039687939  0.041956811  0.033275640  0.045127257
## ar6  0.031278069  0.038331372  0.030976939  0.032082437  0.032713375
## ma1 -0.033430815 -0.032509555 -0.031508844 -0.026216022 -0.033483107
## ma2 -0.009980524 -0.012666299 -0.009497476 -0.010005227 -0.009822274
## ma3 -0.015607428 -0.014875770 -0.016116227 -0.012899224 -0.016324796
## ma4  0.008012125  0.007421267  0.006577253  0.004564824  0.006868174
## ma5 -0.017172535 -0.015974027 -0.016479799 -0.014295275 -0.018385903
##              ar6          ma1          ma2          ma3          ma4
## ar1  0.031278069 -0.033430815 -0.009980524 -0.015607428  0.008012125
## ar2  0.038331372 -0.032509555 -0.012666299 -0.014875770  0.007421267
## ar3  0.030976939 -0.031508844 -0.009497476 -0.016116227  0.006577253
## ar4  0.032082437 -0.026216022 -0.010005227 -0.012899224  0.004564824
## ar5  0.032713375 -0.033483107 -0.009822274 -0.016324796  0.006868174
## ar6  0.033269241 -0.027290038 -0.009853798 -0.012503664  0.006272624
## ma1 -0.027290038  0.033747847  0.009072081  0.013461513 -0.005552867
## ma2 -0.009853798  0.009072081  0.007051905  0.004110697 -0.001523462
## ma3 -0.012503664  0.013461513  0.004110697  0.012647204 -0.003000927
## ma4  0.006272624 -0.005552867 -0.001523462 -0.003000927  0.011038802
## ma5 -0.013719438  0.015058823  0.003645952  0.010965617 -0.003304247
##              ma5
## ar1 -0.017172535
## ar2 -0.015974027
## ar3 -0.016479799
## ar4 -0.014295275
## ar5 -0.018385903
## ar6 -0.013719438
## ma1  0.015058823
## ma2  0.003645952
## ma3  0.010965617
## ma4 -0.003304247
## ma5  0.015047382

mean(datadiff1)

## [1] -85.51402

mean(datats)

## [1] 39448.11

dt <- forecast(fit16)
plot(dt)

Box.test(dt$residuals)

## 
##  Box-Pierce test
## 
## data:  dt$residuals
## X-squared = 0.014337, df = 1, p-value = 0.9047

hist(dt$residuals)

checkresiduals(fit16)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(6,1,5)
## Q* = 9.8178, df = 3, p-value = 0.02018
## 
## Model df: 11.   Total lags used: 14

predict(fit16, n.ahead = 12)

## $pred
## Time Series:
## Start = 109 
## End = 120 
## Frequency = 1 
##  [1] 27252.89 18614.28 19152.11 30391.13 31222.43 34604.51 28316.12 17119.66
##  [9] 22287.15 29737.84 31569.23 34370.94
## 
## $se
## Time Series:
## Start = 109 
## End = 120 
## Frequency = 1 
##  [1] 25297.62 25788.78 26771.96 26880.62 27312.02 27313.49 28461.96 29660.08
##  [9] 29830.33 30214.68 30347.92 30399.88

Project Akhir PDS

Nadia Putri Anggraeni

2023-07-16

LOAD DATA

Exploratory Data Analysis (EDA)

Plot data time series

Plot dan hasil korelasi ACF

Plot dan hasil korelasi PACF

Differencing

Plot dan hasil korelasi ACF setelah differencing

Plot dan hasil korelasi PACF setelah differencing

Pemodelan ARIMA

Nilai AIC tiap model

Nilai BIC tiap model

Verifikasi Model

Peramalan