1. Package

library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.1
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(DataExplorer)
library(Kendall)
library(vroom)
## 
## Attaching package: 'vroom'
## 
## The following objects are masked from 'package:readr':
## 
##     as.col_spec, col_character, col_date, col_datetime, col_double,
##     col_factor, col_guess, col_integer, col_logical, col_number,
##     col_skip, col_time, cols, cols_condense, cols_only, date_names,
##     date_names_lang, date_names_langs, default_locale, fwf_cols,
##     fwf_empty, fwf_positions, fwf_widths, locale, output_column,
##     problems, spec
library(readxl)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

2. Deskripsi

Var Nama Keterangan
Y Penduduk Miskin  
X1 Jumlah Penduduk  
X2 Gini Ratio  
X3 Kepadatan  
X4 KHL  
X5 ProporsI Lapangan Kerja Informal  
X6 Proporsi Lapangan Kerja Informal Sektor Non-Pertanian  
X7 TPT  
X8 IPM  
X9 Wilayah Berdasar Zona Waktu  

3. Input Data

data <- read_xlsx("Data Poisson.xlsx", sheet = "Data Kemiskinan")
data$Wilayah <- as.factor(data$Wilayah)
str(data)
## tibble [34 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Provinsi                        : chr [1:34] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
##  $ Penduduk.Miskin                 : num [1:34] 806 1268 335 485 279 ...
##  $ Jumlah.Penduduk                 : num [1:34] 5388 14798 5546 6951 3604 ...
##  $ Gini.Ratio                      : num [1:34] 0.348 0.343 0.329 0.363 0.351 0.358 0.37 0.341 0.248 0.342 ...
##  $ Kepadatan                       : num [1:34] 92 205 133 75 72 93 102 262 90 258 ...
##  $ KHL                             : num [1:34] 1732413 1271058 1474227 1872000 1708174 ...
##  $ Proporsi.Lapangan.Kerja.Informal: num [1:34] 61.5 59.1 65.3 54 60 ...
##  $ TPT                             : num [1:34] 5.97 5.47 6.17 4.4 4.7 4.74 3.39 4.31 4.18 8.02 ...
##  $ IPM                             : num [1:34] 72.8 72.7 73.3 73.5 72.1 ...
##  $ Wilayah                         : Factor w/ 3 levels "Barat","Tengah",..: 1 1 1 1 1 1 1 1 1 1 ...
data.poisson <- data[-1]
str(data.poisson)
## tibble [34 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Penduduk.Miskin                 : num [1:34] 806 1268 335 485 279 ...
##  $ Jumlah.Penduduk                 : num [1:34] 5388 14798 5546 6951 3604 ...
##  $ Gini.Ratio                      : num [1:34] 0.348 0.343 0.329 0.363 0.351 0.358 0.37 0.341 0.248 0.342 ...
##  $ Kepadatan                       : num [1:34] 92 205 133 75 72 93 102 262 90 258 ...
##  $ KHL                             : num [1:34] 1732413 1271058 1474227 1872000 1708174 ...
##  $ Proporsi.Lapangan.Kerja.Informal: num [1:34] 61.5 59.1 65.3 54 60 ...
##  $ TPT                             : num [1:34] 5.97 5.47 6.17 4.4 4.7 4.74 3.39 4.31 4.18 8.02 ...
##  $ IPM                             : num [1:34] 72.8 72.7 73.3 73.5 72.1 ...
##  $ Wilayah                         : Factor w/ 3 levels "Barat","Tengah",..: 1 1 1 1 1 1 1 1 1 1 ...

4. EDA

4.1. Summary data

summary(data.poisson)
##  Penduduk.Miskin  Jumlah.Penduduk     Gini.Ratio       Kepadatan       
##  Min.   :  49.0   Min.   :  708.4   Min.   :0.2480   Min.   :    9.00  
##  1st Qu.: 197.5   1st Qu.: 2360.3   1st Qu.:0.3357   1st Qu.:   54.25  
##  Median : 342.5   Median : 4093.9   Median :0.3495   Median :  103.50  
##  Mean   : 769.0   Mean   : 7929.5   Mean   :0.3573   Mean   :  744.26  
##  3rd Qu.: 812.0   3rd Qu.: 8138.9   3rd Qu.:0.3880   3rd Qu.:  261.00  
##  Max.   :4181.0   Max.   :49565.2   Max.   :0.4480   Max.   :15978.00  
##       KHL          Proporsi.Lapangan.Kerja.Informal      TPT       
##  Min.   : 825000   Min.   :36.32                    Min.   :3.110  
##  1st Qu.:1480618   1st Qu.:53.58                    1st Qu.:3.985  
##  Median :1699587   Median :60.22                    Median :4.775  
##  Mean   :1710832   Mean   :59.95                    Mean   :5.123  
##  3rd Qu.:2014971   3rd Qu.:65.19                    3rd Qu.:5.923  
##  Max.   :2538174   Max.   :84.11                    Max.   :8.530  
##       IPM          Wilayah  
##  Min.   :61.39   Barat :18  
##  1st Qu.:70.23   Tengah:12  
##  Median :72.19   Timur : 4  
##  Mean   :71.97              
##  3rd Qu.:73.22              
##  Max.   :81.65

4.2. Outlier

boxplot(Penduduk.Miskin ~ Wilayah, data = data.poisson,
        main = "Box Plot Jumlah Penduduk Miskin dan Wilayah",
        col = c("lightcyan","lavender", "mistyrose"),
        horizontal = TRUE
        )

boxplot(data$Penduduk.Miskin, 
        horizontal = TRUE)

Terdapat 3 Outlier, lebih baik dihilangkan dari pembentukan model

4.3. Distribusi Poisson

hist(data.poisson$Penduduk.Miskin, probability = TRUE)
lines(density(data.poisson$Penduduk.Miskin), col=2, lwd=2)

Histogram menunjukkan bahwa data variabel respon berdistribusi Poisson

#calculate empirical CDF of data
p = ecdf(data.poisson$Penduduk.Miskin)

#plot CDF
plot(p)

Plot fungsi kepadatan peluang juga menunjukkan bahwa data variabel respon berdistribusi poisson

library(bbmle)
## Loading required package: stats4
## 
## Attaching package: 'bbmle'
## The following object is masked from 'package:dplyr':
## 
##     slice
mle2(Penduduk.Miskin ~ dpois(lambda),
     data=data.poisson,
     start=list(lambda=1))
## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced
## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced
## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced
## 
## Call:
## mle2(minuslogl = Penduduk.Miskin ~ dpois(lambda), start = list(lambda = 1), 
##     data = data.poisson)
## 
## Coefficients:
##   lambda 
## 768.9708 
## 
## Log-likelihood: -16560.95
mean(data.poisson$Penduduk.Miskin)
## [1] 768.9706
library(stats)
ks.test(data.poisson$Penduduk.Miskin, "rpois", lambda=768.9708)
## Warning in ks.test.default(data.poisson$Penduduk.Miskin, "rpois", lambda =
## 768.9708): ties should not be present for the Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  data.poisson$Penduduk.Miskin
## D = 835.06, p-value < 2.2e-16
## alternative hypothesis: two-sided

Uji dengan KS.test juga signifikan, sehingga data variabel respon berdistribusi poisson

5. Create Model

5.1. Model Poisson

model.poisson <- glm(Penduduk.Miskin ~ ., data = data.poisson, family = "poisson")
summary(model.poisson)
## 
## Call:
## glm(formula = Penduduk.Miskin ~ ., family = "poisson", data = data.poisson)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       5.602e+00  4.597e-01  12.186  < 2e-16 ***
## Jumlah.Penduduk                   4.437e-05  1.002e-06  44.301  < 2e-16 ***
## Gini.Ratio                       -1.897e+00  2.545e-01  -7.453 9.12e-14 ***
## Kepadatan                         9.583e-05  4.460e-06  21.486  < 2e-16 ***
## KHL                              -5.572e-07  3.839e-08 -14.514  < 2e-16 ***
## Proporsi.Lapangan.Kerja.Informal  4.812e-02  1.795e-03  26.802  < 2e-16 ***
## TPT                               5.791e-02  7.059e-03   8.204 2.33e-16 ***
## IPM                              -1.692e-02  4.836e-03  -3.498 0.000469 ***
## WilayahTengah                    -3.595e-01  2.224e-02 -16.163  < 2e-16 ***
## WilayahTimur                     -5.303e-01  4.069e-02 -13.033  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 32854.7  on 33  degrees of freedom
## Residual deviance:  3083.5  on 24  degrees of freedom
## AIC: 3370.7
## 
## Number of Fisher Scoring iterations: 5

Tidak terpenuhi equidispersi sehingga galat baku underestimate sehingga P-value nya cenderng gampang menolak H0

5.1.1 Multikolinieritas

vif(model.poisson)
##                                      GVIF Df GVIF^(1/(2*Df))
## Jumlah.Penduduk                  9.399541  1        3.065867
## Gini.Ratio                       2.887754  1        1.699339
## Kepadatan                        2.499372  1        1.580940
## KHL                              9.083746  1        3.013925
## Proporsi.Lapangan.Kerja.Informal 6.215647  1        2.493120
## TPT                              3.710630  1        1.926299
## IPM                              6.026537  1        2.454901
## Wilayah                          3.753311  2        1.391886

Tidak terdapat multikolinieritas antar variabel

5.1.2. Test Over dispersi

3083.5/24 
## [1] 128.4792
library(AER)
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
dispersiontest(model.poisson)
## 
##  Overdispersion test
## 
## data:  model.poisson
## z = 5.0288, p-value = 2.467e-07
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   88.81769

Terjadi overdispersi pada variabel respon

library(DHARMa)
## This is DHARMa 0.4.6. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
sim.poisson <- simulateResiduals(model.poisson, refit = TRUE)
testOverdispersion(sim.poisson)
## testOverdispersion is deprecated, switch your code to using the testDispersion function

## 
##  DHARMa nonparametric dispersion test via mean deviance residual fitted
##  vs. simulated-refitted
## 
## data:  simulationOutput
## dispersion = 128.89, p-value < 2.2e-16
## alternative hypothesis: two.sided

5.1.3. Plot Residual

plotSimulatedResiduals(sim.poisson)
## plotSimulatedResiduals is deprecated, please switch your code to simply using the plot() function

5.2. Negative binomial (semua variabel prediktor disertakan)

model.nb <- glm.nb (Penduduk.Miskin~., data = data.poisson)
summary(model.nb)
## 
## Call:
## glm.nb(formula = Penduduk.Miskin ~ ., data = data.poisson, init.theta = 5.242434309, 
##     link = log)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -2.355e-01  4.334e+00  -0.054   0.9567    
## Jumlah.Penduduk                   5.135e-05  1.211e-05   4.239 2.24e-05 ***
## Gini.Ratio                        8.191e-01  2.445e+00   0.335   0.7376    
## Kepadatan                         4.890e-05  4.335e-05   1.128   0.2593    
## KHL                              -2.404e-07  3.732e-07  -0.644   0.5194    
## Proporsi.Lapangan.Kerja.Informal  6.745e-02  1.717e-02   3.929 8.54e-05 ***
## TPT                               1.890e-01  7.943e-02   2.379   0.0173 *  
## IPM                               1.702e-02  4.641e-02   0.367   0.7138    
## WilayahTengah                    -3.769e-01  1.921e-01  -1.962   0.0497 *  
## WilayahTimur                     -6.037e-01  3.591e-01  -1.681   0.0927 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(5.2424) family taken to be 1)
## 
##     Null deviance: 219.864  on 33  degrees of freedom
## Residual deviance:  35.077  on 24  degrees of freedom
## AIC: 474.46
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  5.24 
##           Std. Err.:  1.26 
## 
##  2 x log-likelihood:  -452.465

Beberapa variabel tidak signifikan dan akan dihilangkan dari pembentukan model.

5.3. Binomial negative (hanya variabel yang siginifikan)

model.nb2 <- glm.nb (Penduduk.Miskin~ Jumlah.Penduduk+Proporsi.Lapangan.Kerja.Informal+TPT+Wilayah, data = data.poisson)
summary(model.nb2)
## 
## Call:
## glm.nb(formula = Penduduk.Miskin ~ Jumlah.Penduduk + Proporsi.Lapangan.Kerja.Informal + 
##     TPT + Wilayah, data = data.poisson, init.theta = 4.678079991, 
##     link = log)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       1.476e+00  9.999e-01   1.476  0.13982    
## Jumlah.Penduduk                   5.949e-05  9.035e-06   6.585 4.56e-11 ***
## Proporsi.Lapangan.Kerja.Informal  5.853e-02  1.180e-02   4.961 7.02e-07 ***
## TPT                               1.800e-01  8.316e-02   2.165  0.03039 *  
## WilayahTengah                    -4.168e-01  1.951e-01  -2.136  0.03268 *  
## WilayahTimur                     -8.259e-01  3.136e-01  -2.634  0.00845 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(4.6781) family taken to be 1)
## 
##     Null deviance: 196.429  on 33  degrees of freedom
## Residual deviance:  35.254  on 28  degrees of freedom
## AIC: 470.57
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  4.68 
##           Std. Err.:  1.12 
## 
##  2 x log-likelihood:  -456.571

5.3.1 Test Over dispersi

sim.nb2 <- simulateResiduals(model.nb2, refit = TRUE, n=99)
testOverdispersion(sim.nb2)
## testOverdispersion is deprecated, switch your code to using the testDispersion function

## 
##  DHARMa nonparametric dispersion test via mean deviance residual fitted
##  vs. simulated-refitted
## 
## data:  simulationOutput
## dispersion = 0.89895, p-value = 0.303
## alternative hypothesis: two.sided

Nlai dispersion = 0.8989, mendekati 1 (tidak lagi terjadi overdispersi pada model negatif binomial)

5.3.2. Plot Residual

plotSimulatedResiduals(sim.nb2)
## plotSimulatedResiduals is deprecated, please switch your code to simply using the plot() function
## DHARMa::testOutliers: nBoot > nSim does not make much sense, thus changed to nBoot = nSim. If you want to increase nBoot, increase nSim in DHARMa::simulateResiduals as well.

6. Perbandingan Model

6.1. Perbandingan AIC

Nama.Model <- c("Model.Poisson", "Model.nb", "Model.nb2")
Nilai.AIC <- c(AIC(model.poisson), AIC(model.nb), AIC(model.nb2))
df <- data.frame(Nama.Model, Nilai.AIC)
print(df)
##      Nama.Model Nilai.AIC
## 1 Model.Poisson 3370.7327
## 2      Model.nb  474.4648
## 3     Model.nb2  470.5705

Model negatif binomial dengan seleksi beberapa variabel menunjukan nilai AIC yang lebih kecil

6.2. Plot Prediksi

prediksi.nb <- predict(model.nb, data = data.poisson, type = "response")
prediksi.poisson <- predict(model.poisson, data = data.poisson, type = "response")
plot.prediksi <- data.frame( data.poisson$Penduduk.Miskin, prediksi.nb, prediksi.poisson)

colnames(plot.prediksi) <- c("Penduduk_Miskin", "Prediksi_Negative_Binomial", "Prediksi_Poisson")

plot.prediksi
##    Penduduk_Miskin Prediksi_Negative_Binomial Prediksi_Poisson
## 1              806                  618.14319         542.0508
## 2             1268                  867.59911         942.3193
## 3              335                  885.32820         791.0109
## 4              485                  298.64541         329.0612
## 5              279                  400.22664         440.9737
## 6             1044                  602.09389         562.9618
## 7              297                  512.45751         594.6896
## 8             1002                 1080.70435        1156.9733
## 9               66                  136.23110         233.5222
## 10             151                  146.27898         139.6040
## 11             502                  499.98138         514.0109
## 12            4070                 8155.04949        4761.3663
## 13            3831                 3332.61135        2989.0587
## 14             454                  345.54137         383.2262
## 15            4181                 4593.98018        4366.4813
## 16             814                  696.48920         588.0338
## 17             205                  218.60452         236.5721
## 18             731                  784.76503         744.8363
## 19            1131                  581.12912         779.8757
## 20             350                  402.60861         541.0312
## 21             145                  169.58993         203.6248
## 22             195                  214.80749         278.7523
## 23             236                  140.72886         128.6405
## 24              49                   95.45865         155.3389
## 25             185                  366.60047         312.6241
## 26             388                  378.65151         486.3034
## 27             777                  553.08859         400.2148
## 28             309                  280.98987         299.6829
## 29             185                  250.99591         276.9495
## 30             165                  539.59353         456.2095
## 31             290                  284.75435         269.8443
## 32              79                  235.69352         252.9406
## 33             218                  179.57711         226.5107
## 34             922                  645.74151         759.7043
x <- 1:34
plot(x, plot.prediksi$Penduduk_Miskin, 
     type = "o",
     xlab = "Amatan",
     ylab = "Jumlah Penduduk Miskin",
     main = "Plot Prediksi dan Data Asli"
     )  
lines(x, plot.prediksi$Prediksi_Negative_Binomial, 
      type = "l", 
      col = "red"
      )  
lines(x, plot.prediksi$Prediksi_Poisson, 
      type = "l", 
      col = "green"
      )    

7. Interpretasi Koefisien Regresi

data.frame(model.nb2$coefficients, exp(model.nb2$coefficients))
##                                  model.nb2.coefficients
## (Intercept)                                1.476341e+00
## Jumlah.Penduduk                            5.948951e-05
## Proporsi.Lapangan.Kerja.Informal           5.852904e-02
## TPT                                        1.800434e-01
## WilayahTengah                             -4.167909e-01
## WilayahTimur                              -8.259177e-01
##                                  exp.model.nb2.coefficients.
## (Intercept)                                        4.3768997
## Jumlah.Penduduk                                    1.0000595
## Proporsi.Lapangan.Kerja.Informal                   1.0602758
## TPT                                                1.1972693
## WilayahTengah                                      0.6591587
## WilayahTimur                                       0.4378330
Variabel Exp (coef) Interpretasi
Jumlah penduduk 1.0000595 Setiap peningkatan 1000 jumlah penduduk, akan meningkatkan jumlah penduduk miskin 1.0000595 kali lipat
Proporsi lapangan kerja informal 1.0602758 Setiap peningkatan 1% proporsi lapangan kerja informal, akan meningkatkan jumlah penduduk miskin 1.0602758 kali lipat.
TPT 1.1972693 Setiap peningkatan 1% tingkat pengangguran terbuka, akan meningkatkan jumlah penduduk miskin 1.1972693 kali lipat.
Wilayah tengah 0.6591587 Peluang daerah yang berada di wilayah tengah untuk mengalami peningkatan penduduk miskin adalah 0.6591587 kali dibanding daerah yang berada di wilayah barat
Wilayah timur 0.4378330 Peluang daerah yang berada di wilayah timur untuk mengalami peningkatan penduduk miskin adalah 0.437833 kali dibanding daerah yang berada di wilayah barat

8. Prediksi data baru

No Jumlah.Penduduk Proporsi.Lapangan.Kerja.Informal TPT Wilayah
1 14798,4 59,07 4,7 Barat
2 6951,2 59,98 4,31 Barat
3 1469,8 67,54 8,02 Barat
4 10576,4 49,63 8,35 Barat
5 34738,2 36,88 3,73 Barat
6 3919,2 63,86 4,84 Tengah
7 2686,3 53,43 3,3 Tengah
8 3664,7 58,77 4,2 Tengah
9 2512,9 44,26 4,62 Tengah
10 8888,8 63,55 6,51 Timur
data.baru <- read_xlsx("Data Poisson.xlsx", sheet = "Data Kemiskinan Baru")
data.baru$Wilayah <- as.factor(data.baru$Wilayah)
data.baru
## # A tibble: 10 × 4
##    Jumlah.Penduduk Proporsi.Lapangan.Kerja.Informal   TPT Wilayah
##              <dbl>                            <dbl> <dbl> <fct>  
##  1          14798.                             59.1  4.7  Barat  
##  2           6951.                             60.0  4.31 Barat  
##  3           1470.                             67.5  8.02 Barat  
##  4          10576.                             49.6  8.35 Barat  
##  5          34738.                             36.9  3.73 Barat  
##  6           3919.                             63.9  4.84 Tengah 
##  7           2686.                             53.4  3.3  Tengah 
##  8           3665.                             58.8  4.2  Tengah 
##  9           2513.                             44.3  4.62 Tengah 
## 10           8889.                             63.6  6.51 Timur
data.frame(predict(model.nb2, newdata = data.baru, type = "response"))
##    predict.model.nb2..newdata...data.baru..type....response..
## 1                                                    780.7106
## 2                                                    481.2674
## 3                                                   1054.4532
## 4                                                    674.3060
## 5                                                    585.8197
## 6                                                    365.6862
## 7                                                    139.8690
## 8                                                    238.2916
## 9                                                    102.6512
## 10                                                   433.0340