1. Package
library(readr)
library(tidyverse)## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.1
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(DataExplorer)
library(Kendall)
library(vroom)##
## Attaching package: 'vroom'
##
## The following objects are masked from 'package:readr':
##
## as.col_spec, col_character, col_date, col_datetime, col_double,
## col_factor, col_guess, col_integer, col_logical, col_number,
## col_skip, col_time, cols, cols_condense, cols_only, date_names,
## date_names_lang, date_names_langs, default_locale, fwf_cols,
## fwf_empty, fwf_positions, fwf_widths, locale, output_column,
## problems, spec
library(readxl)
library(MASS)##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(ggplot2)
library(car)## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
2. Deskripsi
| Var | Nama | Keterangan |
| Y | Penduduk Miskin | |
| X1 | Jumlah Penduduk | |
| X2 | Gini Ratio | |
| X3 | Kepadatan | |
| X4 | KHL | |
| X5 | ProporsI Lapangan Kerja Informal | |
| X6 | Proporsi Lapangan Kerja Informal Sektor Non-Pertanian | |
| X7 | TPT | |
| X8 | IPM | |
| X9 | Wilayah Berdasar Zona Waktu |
3. Input Data
data <- read_xlsx("Data Poisson.xlsx", sheet = "Data Kemiskinan")
data$Wilayah <- as.factor(data$Wilayah)str(data)## tibble [34 × 10] (S3: tbl_df/tbl/data.frame)
## $ Provinsi : chr [1:34] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
## $ Penduduk.Miskin : num [1:34] 806 1268 335 485 279 ...
## $ Jumlah.Penduduk : num [1:34] 5388 14798 5546 6951 3604 ...
## $ Gini.Ratio : num [1:34] 0.348 0.343 0.329 0.363 0.351 0.358 0.37 0.341 0.248 0.342 ...
## $ Kepadatan : num [1:34] 92 205 133 75 72 93 102 262 90 258 ...
## $ KHL : num [1:34] 1732413 1271058 1474227 1872000 1708174 ...
## $ Proporsi.Lapangan.Kerja.Informal: num [1:34] 61.5 59.1 65.3 54 60 ...
## $ TPT : num [1:34] 5.97 5.47 6.17 4.4 4.7 4.74 3.39 4.31 4.18 8.02 ...
## $ IPM : num [1:34] 72.8 72.7 73.3 73.5 72.1 ...
## $ Wilayah : Factor w/ 3 levels "Barat","Tengah",..: 1 1 1 1 1 1 1 1 1 1 ...
data.poisson <- data[-1]str(data.poisson)## tibble [34 × 9] (S3: tbl_df/tbl/data.frame)
## $ Penduduk.Miskin : num [1:34] 806 1268 335 485 279 ...
## $ Jumlah.Penduduk : num [1:34] 5388 14798 5546 6951 3604 ...
## $ Gini.Ratio : num [1:34] 0.348 0.343 0.329 0.363 0.351 0.358 0.37 0.341 0.248 0.342 ...
## $ Kepadatan : num [1:34] 92 205 133 75 72 93 102 262 90 258 ...
## $ KHL : num [1:34] 1732413 1271058 1474227 1872000 1708174 ...
## $ Proporsi.Lapangan.Kerja.Informal: num [1:34] 61.5 59.1 65.3 54 60 ...
## $ TPT : num [1:34] 5.97 5.47 6.17 4.4 4.7 4.74 3.39 4.31 4.18 8.02 ...
## $ IPM : num [1:34] 72.8 72.7 73.3 73.5 72.1 ...
## $ Wilayah : Factor w/ 3 levels "Barat","Tengah",..: 1 1 1 1 1 1 1 1 1 1 ...
4. EDA
4.1. Summary data
summary(data.poisson)## Penduduk.Miskin Jumlah.Penduduk Gini.Ratio Kepadatan
## Min. : 49.0 Min. : 708.4 Min. :0.2480 Min. : 9.00
## 1st Qu.: 197.5 1st Qu.: 2360.3 1st Qu.:0.3357 1st Qu.: 54.25
## Median : 342.5 Median : 4093.9 Median :0.3495 Median : 103.50
## Mean : 769.0 Mean : 7929.5 Mean :0.3573 Mean : 744.26
## 3rd Qu.: 812.0 3rd Qu.: 8138.9 3rd Qu.:0.3880 3rd Qu.: 261.00
## Max. :4181.0 Max. :49565.2 Max. :0.4480 Max. :15978.00
## KHL Proporsi.Lapangan.Kerja.Informal TPT
## Min. : 825000 Min. :36.32 Min. :3.110
## 1st Qu.:1480618 1st Qu.:53.58 1st Qu.:3.985
## Median :1699587 Median :60.22 Median :4.775
## Mean :1710832 Mean :59.95 Mean :5.123
## 3rd Qu.:2014971 3rd Qu.:65.19 3rd Qu.:5.923
## Max. :2538174 Max. :84.11 Max. :8.530
## IPM Wilayah
## Min. :61.39 Barat :18
## 1st Qu.:70.23 Tengah:12
## Median :72.19 Timur : 4
## Mean :71.97
## 3rd Qu.:73.22
## Max. :81.65
4.2. Outlier
boxplot(Penduduk.Miskin ~ Wilayah, data = data.poisson,
main = "Box Plot Jumlah Penduduk Miskin dan Wilayah",
col = c("lightcyan","lavender", "mistyrose"),
horizontal = TRUE
)boxplot(data$Penduduk.Miskin,
horizontal = TRUE)Terdapat 3 Outlier, lebih baik dihilangkan dari pembentukan model
4.3. Distribusi Poisson
hist(data.poisson$Penduduk.Miskin, probability = TRUE)
lines(density(data.poisson$Penduduk.Miskin), col=2, lwd=2)Histogram menunjukkan bahwa data variabel respon berdistribusi Poisson
#calculate empirical CDF of data
p = ecdf(data.poisson$Penduduk.Miskin)
#plot CDF
plot(p)Plot fungsi kepadatan peluang juga menunjukkan bahwa data variabel respon berdistribusi poisson
library(bbmle)## Loading required package: stats4
##
## Attaching package: 'bbmle'
## The following object is masked from 'package:dplyr':
##
## slice
mle2(Penduduk.Miskin ~ dpois(lambda),
data=data.poisson,
start=list(lambda=1))## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced
## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced
## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced
##
## Call:
## mle2(minuslogl = Penduduk.Miskin ~ dpois(lambda), start = list(lambda = 1),
## data = data.poisson)
##
## Coefficients:
## lambda
## 768.9708
##
## Log-likelihood: -16560.95
mean(data.poisson$Penduduk.Miskin)## [1] 768.9706
library(stats)
ks.test(data.poisson$Penduduk.Miskin, "rpois", lambda=768.9708)## Warning in ks.test.default(data.poisson$Penduduk.Miskin, "rpois", lambda =
## 768.9708): ties should not be present for the Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: data.poisson$Penduduk.Miskin
## D = 835.06, p-value < 2.2e-16
## alternative hypothesis: two-sided
Uji dengan KS.test juga signifikan, sehingga data variabel respon berdistribusi poisson
5. Create Model
5.1. Model Poisson
model.poisson <- glm(Penduduk.Miskin ~ ., data = data.poisson, family = "poisson")
summary(model.poisson)##
## Call:
## glm(formula = Penduduk.Miskin ~ ., family = "poisson", data = data.poisson)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.602e+00 4.597e-01 12.186 < 2e-16 ***
## Jumlah.Penduduk 4.437e-05 1.002e-06 44.301 < 2e-16 ***
## Gini.Ratio -1.897e+00 2.545e-01 -7.453 9.12e-14 ***
## Kepadatan 9.583e-05 4.460e-06 21.486 < 2e-16 ***
## KHL -5.572e-07 3.839e-08 -14.514 < 2e-16 ***
## Proporsi.Lapangan.Kerja.Informal 4.812e-02 1.795e-03 26.802 < 2e-16 ***
## TPT 5.791e-02 7.059e-03 8.204 2.33e-16 ***
## IPM -1.692e-02 4.836e-03 -3.498 0.000469 ***
## WilayahTengah -3.595e-01 2.224e-02 -16.163 < 2e-16 ***
## WilayahTimur -5.303e-01 4.069e-02 -13.033 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 32854.7 on 33 degrees of freedom
## Residual deviance: 3083.5 on 24 degrees of freedom
## AIC: 3370.7
##
## Number of Fisher Scoring iterations: 5
Tidak terpenuhi equidispersi sehingga galat baku underestimate sehingga P-value nya cenderng gampang menolak H0
5.1.1 Multikolinieritas
vif(model.poisson)## GVIF Df GVIF^(1/(2*Df))
## Jumlah.Penduduk 9.399541 1 3.065867
## Gini.Ratio 2.887754 1 1.699339
## Kepadatan 2.499372 1 1.580940
## KHL 9.083746 1 3.013925
## Proporsi.Lapangan.Kerja.Informal 6.215647 1 2.493120
## TPT 3.710630 1 1.926299
## IPM 6.026537 1 2.454901
## Wilayah 3.753311 2 1.391886
Tidak terdapat multikolinieritas antar variabel
5.1.2. Test Over dispersi
3083.5/24 ## [1] 128.4792
library(AER)## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
dispersiontest(model.poisson)##
## Overdispersion test
##
## data: model.poisson
## z = 5.0288, p-value = 2.467e-07
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 88.81769
Terjadi overdispersi pada variabel respon
library(DHARMa)## This is DHARMa 0.4.6. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
sim.poisson <- simulateResiduals(model.poisson, refit = TRUE)
testOverdispersion(sim.poisson)## testOverdispersion is deprecated, switch your code to using the testDispersion function
##
## DHARMa nonparametric dispersion test via mean deviance residual fitted
## vs. simulated-refitted
##
## data: simulationOutput
## dispersion = 128.89, p-value < 2.2e-16
## alternative hypothesis: two.sided
5.1.3. Plot Residual
plotSimulatedResiduals(sim.poisson)## plotSimulatedResiduals is deprecated, please switch your code to simply using the plot() function
5.2. Negative binomial (semua variabel prediktor disertakan)
model.nb <- glm.nb (Penduduk.Miskin~., data = data.poisson)
summary(model.nb)##
## Call:
## glm.nb(formula = Penduduk.Miskin ~ ., data = data.poisson, init.theta = 5.242434309,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.355e-01 4.334e+00 -0.054 0.9567
## Jumlah.Penduduk 5.135e-05 1.211e-05 4.239 2.24e-05 ***
## Gini.Ratio 8.191e-01 2.445e+00 0.335 0.7376
## Kepadatan 4.890e-05 4.335e-05 1.128 0.2593
## KHL -2.404e-07 3.732e-07 -0.644 0.5194
## Proporsi.Lapangan.Kerja.Informal 6.745e-02 1.717e-02 3.929 8.54e-05 ***
## TPT 1.890e-01 7.943e-02 2.379 0.0173 *
## IPM 1.702e-02 4.641e-02 0.367 0.7138
## WilayahTengah -3.769e-01 1.921e-01 -1.962 0.0497 *
## WilayahTimur -6.037e-01 3.591e-01 -1.681 0.0927 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(5.2424) family taken to be 1)
##
## Null deviance: 219.864 on 33 degrees of freedom
## Residual deviance: 35.077 on 24 degrees of freedom
## AIC: 474.46
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 5.24
## Std. Err.: 1.26
##
## 2 x log-likelihood: -452.465
Beberapa variabel tidak signifikan dan akan dihilangkan dari pembentukan model.
5.3. Binomial negative (hanya variabel yang siginifikan)
model.nb2 <- glm.nb (Penduduk.Miskin~ Jumlah.Penduduk+Proporsi.Lapangan.Kerja.Informal+TPT+Wilayah, data = data.poisson)
summary(model.nb2)##
## Call:
## glm.nb(formula = Penduduk.Miskin ~ Jumlah.Penduduk + Proporsi.Lapangan.Kerja.Informal +
## TPT + Wilayah, data = data.poisson, init.theta = 4.678079991,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.476e+00 9.999e-01 1.476 0.13982
## Jumlah.Penduduk 5.949e-05 9.035e-06 6.585 4.56e-11 ***
## Proporsi.Lapangan.Kerja.Informal 5.853e-02 1.180e-02 4.961 7.02e-07 ***
## TPT 1.800e-01 8.316e-02 2.165 0.03039 *
## WilayahTengah -4.168e-01 1.951e-01 -2.136 0.03268 *
## WilayahTimur -8.259e-01 3.136e-01 -2.634 0.00845 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(4.6781) family taken to be 1)
##
## Null deviance: 196.429 on 33 degrees of freedom
## Residual deviance: 35.254 on 28 degrees of freedom
## AIC: 470.57
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 4.68
## Std. Err.: 1.12
##
## 2 x log-likelihood: -456.571
5.3.1 Test Over dispersi
sim.nb2 <- simulateResiduals(model.nb2, refit = TRUE, n=99)
testOverdispersion(sim.nb2)## testOverdispersion is deprecated, switch your code to using the testDispersion function
##
## DHARMa nonparametric dispersion test via mean deviance residual fitted
## vs. simulated-refitted
##
## data: simulationOutput
## dispersion = 0.89895, p-value = 0.303
## alternative hypothesis: two.sided
Nlai dispersion = 0.8989, mendekati 1 (tidak lagi terjadi overdispersi pada model negatif binomial)
5.3.2. Plot Residual
plotSimulatedResiduals(sim.nb2)## plotSimulatedResiduals is deprecated, please switch your code to simply using the plot() function
## DHARMa::testOutliers: nBoot > nSim does not make much sense, thus changed to nBoot = nSim. If you want to increase nBoot, increase nSim in DHARMa::simulateResiduals as well.
6. Perbandingan Model
6.1. Perbandingan AIC
Nama.Model <- c("Model.Poisson", "Model.nb", "Model.nb2")
Nilai.AIC <- c(AIC(model.poisson), AIC(model.nb), AIC(model.nb2))
df <- data.frame(Nama.Model, Nilai.AIC)
print(df)## Nama.Model Nilai.AIC
## 1 Model.Poisson 3370.7327
## 2 Model.nb 474.4648
## 3 Model.nb2 470.5705
Model negatif binomial dengan seleksi beberapa variabel menunjukan nilai AIC yang lebih kecil
6.2. Plot Prediksi
prediksi.nb <- predict(model.nb, data = data.poisson, type = "response")
prediksi.poisson <- predict(model.poisson, data = data.poisson, type = "response")plot.prediksi <- data.frame( data.poisson$Penduduk.Miskin, prediksi.nb, prediksi.poisson)
colnames(plot.prediksi) <- c("Penduduk_Miskin", "Prediksi_Negative_Binomial", "Prediksi_Poisson")
plot.prediksi## Penduduk_Miskin Prediksi_Negative_Binomial Prediksi_Poisson
## 1 806 618.14319 542.0508
## 2 1268 867.59911 942.3193
## 3 335 885.32820 791.0109
## 4 485 298.64541 329.0612
## 5 279 400.22664 440.9737
## 6 1044 602.09389 562.9618
## 7 297 512.45751 594.6896
## 8 1002 1080.70435 1156.9733
## 9 66 136.23110 233.5222
## 10 151 146.27898 139.6040
## 11 502 499.98138 514.0109
## 12 4070 8155.04949 4761.3663
## 13 3831 3332.61135 2989.0587
## 14 454 345.54137 383.2262
## 15 4181 4593.98018 4366.4813
## 16 814 696.48920 588.0338
## 17 205 218.60452 236.5721
## 18 731 784.76503 744.8363
## 19 1131 581.12912 779.8757
## 20 350 402.60861 541.0312
## 21 145 169.58993 203.6248
## 22 195 214.80749 278.7523
## 23 236 140.72886 128.6405
## 24 49 95.45865 155.3389
## 25 185 366.60047 312.6241
## 26 388 378.65151 486.3034
## 27 777 553.08859 400.2148
## 28 309 280.98987 299.6829
## 29 185 250.99591 276.9495
## 30 165 539.59353 456.2095
## 31 290 284.75435 269.8443
## 32 79 235.69352 252.9406
## 33 218 179.57711 226.5107
## 34 922 645.74151 759.7043
x <- 1:34
plot(x, plot.prediksi$Penduduk_Miskin,
type = "o",
xlab = "Amatan",
ylab = "Jumlah Penduduk Miskin",
main = "Plot Prediksi dan Data Asli"
)
lines(x, plot.prediksi$Prediksi_Negative_Binomial,
type = "l",
col = "red"
)
lines(x, plot.prediksi$Prediksi_Poisson,
type = "l",
col = "green"
) 7. Interpretasi Koefisien Regresi
data.frame(model.nb2$coefficients, exp(model.nb2$coefficients))## model.nb2.coefficients
## (Intercept) 1.476341e+00
## Jumlah.Penduduk 5.948951e-05
## Proporsi.Lapangan.Kerja.Informal 5.852904e-02
## TPT 1.800434e-01
## WilayahTengah -4.167909e-01
## WilayahTimur -8.259177e-01
## exp.model.nb2.coefficients.
## (Intercept) 4.3768997
## Jumlah.Penduduk 1.0000595
## Proporsi.Lapangan.Kerja.Informal 1.0602758
## TPT 1.1972693
## WilayahTengah 0.6591587
## WilayahTimur 0.4378330
| Variabel | Exp (coef) | Interpretasi |
|---|---|---|
| Jumlah penduduk | 1.0000595 | Setiap peningkatan 1000 jumlah penduduk, akan meningkatkan jumlah penduduk miskin 1.0000595 kali lipat |
| Proporsi lapangan kerja informal | 1.0602758 | Setiap peningkatan 1% proporsi lapangan kerja informal, akan meningkatkan jumlah penduduk miskin 1.0602758 kali lipat. |
| TPT | 1.1972693 | Setiap peningkatan 1% tingkat pengangguran terbuka, akan meningkatkan jumlah penduduk miskin 1.1972693 kali lipat. |
| Wilayah tengah | 0.6591587 | Peluang daerah yang berada di wilayah tengah untuk mengalami peningkatan penduduk miskin adalah 0.6591587 kali dibanding daerah yang berada di wilayah barat |
| Wilayah timur | 0.4378330 | Peluang daerah yang berada di wilayah timur untuk mengalami peningkatan penduduk miskin adalah 0.437833 kali dibanding daerah yang berada di wilayah barat |
8. Prediksi data baru
| No | Jumlah.Penduduk | Proporsi.Lapangan.Kerja.Informal | TPT | Wilayah |
| 1 | 14798,4 | 59,07 | 4,7 | Barat |
| 2 | 6951,2 | 59,98 | 4,31 | Barat |
| 3 | 1469,8 | 67,54 | 8,02 | Barat |
| 4 | 10576,4 | 49,63 | 8,35 | Barat |
| 5 | 34738,2 | 36,88 | 3,73 | Barat |
| 6 | 3919,2 | 63,86 | 4,84 | Tengah |
| 7 | 2686,3 | 53,43 | 3,3 | Tengah |
| 8 | 3664,7 | 58,77 | 4,2 | Tengah |
| 9 | 2512,9 | 44,26 | 4,62 | Tengah |
| 10 | 8888,8 | 63,55 | 6,51 | Timur |
data.baru <- read_xlsx("Data Poisson.xlsx", sheet = "Data Kemiskinan Baru")
data.baru$Wilayah <- as.factor(data.baru$Wilayah)
data.baru## # A tibble: 10 × 4
## Jumlah.Penduduk Proporsi.Lapangan.Kerja.Informal TPT Wilayah
## <dbl> <dbl> <dbl> <fct>
## 1 14798. 59.1 4.7 Barat
## 2 6951. 60.0 4.31 Barat
## 3 1470. 67.5 8.02 Barat
## 4 10576. 49.6 8.35 Barat
## 5 34738. 36.9 3.73 Barat
## 6 3919. 63.9 4.84 Tengah
## 7 2686. 53.4 3.3 Tengah
## 8 3665. 58.8 4.2 Tengah
## 9 2513. 44.3 4.62 Tengah
## 10 8889. 63.6 6.51 Timur
data.frame(predict(model.nb2, newdata = data.baru, type = "response"))## predict.model.nb2..newdata...data.baru..type....response..
## 1 780.7106
## 2 481.2674
## 3 1054.4532
## 4 674.3060
## 5 585.8197
## 6 365.6862
## 7 139.8690
## 8 238.2916
## 9 102.6512
## 10 433.0340