1. Package

library(readr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.1
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(DT)
library(DataExplorer)
library(Kendall)
library(vroom)

## 
## Attaching package: 'vroom'
## 
## The following objects are masked from 'package:readr':
## 
##     as.col_spec, col_character, col_date, col_datetime, col_double,
##     col_factor, col_guess, col_integer, col_logical, col_number,
##     col_skip, col_time, cols, cols_condense, cols_only, date_names,
##     date_names_lang, date_names_langs, default_locale, fwf_cols,
##     fwf_empty, fwf_positions, fwf_widths, locale, output_column,
##     problems, spec

library(readxl)
library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(ggplot2)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

2. Deskripsi

Var	Nama	Keterangan
Y	Penduduk Miskin
X1	Jumlah Penduduk
X2	Gini Ratio
X3	Kepadatan
X4	KHL
X5	ProporsI Lapangan Kerja Informal
X6	Proporsi Lapangan Kerja Informal Sektor Non-Pertanian
X7	TPT
X8	IPM
X9	Wilayah Berdasar Zona Waktu

3. Input Data

data <- read_xlsx("Data Poisson.xlsx", sheet = "Data Kemiskinan")
data$Wilayah <- as.factor(data$Wilayah)

str(data)

## tibble [34 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Provinsi                        : chr [1:34] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
##  $ Penduduk.Miskin                 : num [1:34] 806 1268 335 485 279 ...
##  $ Jumlah.Penduduk                 : num [1:34] 5388 14798 5546 6951 3604 ...
##  $ Gini.Ratio                      : num [1:34] 0.348 0.343 0.329 0.363 0.351 0.358 0.37 0.341 0.248 0.342 ...
##  $ Kepadatan                       : num [1:34] 92 205 133 75 72 93 102 262 90 258 ...
##  $ KHL                             : num [1:34] 1732413 1271058 1474227 1872000 1708174 ...
##  $ Proporsi.Lapangan.Kerja.Informal: num [1:34] 61.5 59.1 65.3 54 60 ...
##  $ TPT                             : num [1:34] 5.97 5.47 6.17 4.4 4.7 4.74 3.39 4.31 4.18 8.02 ...
##  $ IPM                             : num [1:34] 72.8 72.7 73.3 73.5 72.1 ...
##  $ Wilayah                         : Factor w/ 3 levels "Barat","Tengah",..: 1 1 1 1 1 1 1 1 1 1 ...

data.poisson <- data[-1]

str(data.poisson)

## tibble [34 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Penduduk.Miskin                 : num [1:34] 806 1268 335 485 279 ...
##  $ Jumlah.Penduduk                 : num [1:34] 5388 14798 5546 6951 3604 ...
##  $ Gini.Ratio                      : num [1:34] 0.348 0.343 0.329 0.363 0.351 0.358 0.37 0.341 0.248 0.342 ...
##  $ Kepadatan                       : num [1:34] 92 205 133 75 72 93 102 262 90 258 ...
##  $ KHL                             : num [1:34] 1732413 1271058 1474227 1872000 1708174 ...
##  $ Proporsi.Lapangan.Kerja.Informal: num [1:34] 61.5 59.1 65.3 54 60 ...
##  $ TPT                             : num [1:34] 5.97 5.47 6.17 4.4 4.7 4.74 3.39 4.31 4.18 8.02 ...
##  $ IPM                             : num [1:34] 72.8 72.7 73.3 73.5 72.1 ...
##  $ Wilayah                         : Factor w/ 3 levels "Barat","Tengah",..: 1 1 1 1 1 1 1 1 1 1 ...

4. EDA

4.1. Summary data

summary(data.poisson)

##  Penduduk.Miskin  Jumlah.Penduduk     Gini.Ratio       Kepadatan       
##  Min.   :  49.0   Min.   :  708.4   Min.   :0.2480   Min.   :    9.00  
##  1st Qu.: 197.5   1st Qu.: 2360.3   1st Qu.:0.3357   1st Qu.:   54.25  
##  Median : 342.5   Median : 4093.9   Median :0.3495   Median :  103.50  
##  Mean   : 769.0   Mean   : 7929.5   Mean   :0.3573   Mean   :  744.26  
##  3rd Qu.: 812.0   3rd Qu.: 8138.9   3rd Qu.:0.3880   3rd Qu.:  261.00  
##  Max.   :4181.0   Max.   :49565.2   Max.   :0.4480   Max.   :15978.00  
##       KHL          Proporsi.Lapangan.Kerja.Informal      TPT       
##  Min.   : 825000   Min.   :36.32                    Min.   :3.110  
##  1st Qu.:1480618   1st Qu.:53.58                    1st Qu.:3.985  
##  Median :1699587   Median :60.22                    Median :4.775  
##  Mean   :1710832   Mean   :59.95                    Mean   :5.123  
##  3rd Qu.:2014971   3rd Qu.:65.19                    3rd Qu.:5.923  
##  Max.   :2538174   Max.   :84.11                    Max.   :8.530  
##       IPM          Wilayah  
##  Min.   :61.39   Barat :18  
##  1st Qu.:70.23   Tengah:12  
##  Median :72.19   Timur : 4  
##  Mean   :71.97              
##  3rd Qu.:73.22              
##  Max.   :81.65

4.2. Outlier

boxplot(Penduduk.Miskin ~ Wilayah, data = data.poisson,
        main = "Box Plot Jumlah Penduduk Miskin dan Wilayah",
        col = c("lightcyan","lavender", "mistyrose"),
        horizontal = TRUE
        )

boxplot(data$Penduduk.Miskin, 
        horizontal = TRUE)

Terdapat 3 Outlier, lebih baik dihilangkan dari pembentukan model

4.3. Distribusi Poisson

hist(data.poisson$Penduduk.Miskin, probability = TRUE)
lines(density(data.poisson$Penduduk.Miskin), col=2, lwd=2)

Histogram menunjukkan bahwa data variabel respon berdistribusi Poisson

#calculate empirical CDF of data
p = ecdf(data.poisson$Penduduk.Miskin)

#plot CDF
plot(p)

Plot fungsi kepadatan peluang juga menunjukkan bahwa data variabel respon berdistribusi poisson

library(bbmle)

## Loading required package: stats4

## 
## Attaching package: 'bbmle'

## The following object is masked from 'package:dplyr':
## 
##     slice

mle2(Penduduk.Miskin ~ dpois(lambda),
     data=data.poisson,
     start=list(lambda=1))

## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced

## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced

## Warning in dpois(x = c(806, 1268, 335, 485, 279, 1044, 297, 1002, 66, 151, :
## NaNs produced

## 
## Call:
## mle2(minuslogl = Penduduk.Miskin ~ dpois(lambda), start = list(lambda = 1), 
##     data = data.poisson)
## 
## Coefficients:
##   lambda 
## 768.9708 
## 
## Log-likelihood: -16560.95

mean(data.poisson$Penduduk.Miskin)

## [1] 768.9706

library(stats)
ks.test(data.poisson$Penduduk.Miskin, "rpois", lambda=768.9708)

## Warning in ks.test.default(data.poisson$Penduduk.Miskin, "rpois", lambda =
## 768.9708): ties should not be present for the Kolmogorov-Smirnov test

## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  data.poisson$Penduduk.Miskin
## D = 835.06, p-value < 2.2e-16
## alternative hypothesis: two-sided

Uji dengan KS.test juga signifikan, sehingga data variabel respon berdistribusi poisson

5. Create Model

5.1. Model Poisson

model.poisson <- glm(Penduduk.Miskin ~ ., data = data.poisson, family = "poisson")
summary(model.poisson)

## 
## Call:
## glm(formula = Penduduk.Miskin ~ ., family = "poisson", data = data.poisson)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       5.602e+00  4.597e-01  12.186  < 2e-16 ***
## Jumlah.Penduduk                   4.437e-05  1.002e-06  44.301  < 2e-16 ***
## Gini.Ratio                       -1.897e+00  2.545e-01  -7.453 9.12e-14 ***
## Kepadatan                         9.583e-05  4.460e-06  21.486  < 2e-16 ***
## KHL                              -5.572e-07  3.839e-08 -14.514  < 2e-16 ***
## Proporsi.Lapangan.Kerja.Informal  4.812e-02  1.795e-03  26.802  < 2e-16 ***
## TPT                               5.791e-02  7.059e-03   8.204 2.33e-16 ***
## IPM                              -1.692e-02  4.836e-03  -3.498 0.000469 ***
## WilayahTengah                    -3.595e-01  2.224e-02 -16.163  < 2e-16 ***
## WilayahTimur                     -5.303e-01  4.069e-02 -13.033  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 32854.7  on 33  degrees of freedom
## Residual deviance:  3083.5  on 24  degrees of freedom
## AIC: 3370.7
## 
## Number of Fisher Scoring iterations: 5

Tidak terpenuhi equidispersi sehingga galat baku underestimate sehingga P-value nya cenderng gampang menolak H0

5.1.1 Multikolinieritas

vif(model.poisson)

##                                      GVIF Df GVIF^(1/(2*Df))
## Jumlah.Penduduk                  9.399541  1        3.065867
## Gini.Ratio                       2.887754  1        1.699339
## Kepadatan                        2.499372  1        1.580940
## KHL                              9.083746  1        3.013925
## Proporsi.Lapangan.Kerja.Informal 6.215647  1        2.493120
## TPT                              3.710630  1        1.926299
## IPM                              6.026537  1        2.454901
## Wilayah                          3.753311  2        1.391886

Tidak terdapat multikolinieritas antar variabel

5.1.2. Test Over dispersi

3083.5/24

## [1] 128.4792

library(AER)

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Loading required package: survival

dispersiontest(model.poisson)

## 
##  Overdispersion test
## 
## data:  model.poisson
## z = 5.0288, p-value = 2.467e-07
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   88.81769

Terjadi overdispersi pada variabel respon

library(DHARMa)

## This is DHARMa 0.4.6. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')

sim.poisson <- simulateResiduals(model.poisson, refit = TRUE)
testOverdispersion(sim.poisson)

## testOverdispersion is deprecated, switch your code to using the testDispersion function

## 
##  DHARMa nonparametric dispersion test via mean deviance residual fitted
##  vs. simulated-refitted
## 
## data:  simulationOutput
## dispersion = 128.89, p-value < 2.2e-16
## alternative hypothesis: two.sided

5.1.3. Plot Residual

plotSimulatedResiduals(sim.poisson)

## plotSimulatedResiduals is deprecated, please switch your code to simply using the plot() function

5.2. Negative binomial (semua variabel prediktor disertakan)

model.nb <- glm.nb (Penduduk.Miskin~., data = data.poisson)
summary(model.nb)

## 
## Call:
## glm.nb(formula = Penduduk.Miskin ~ ., data = data.poisson, init.theta = 5.242434309, 
##     link = log)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -2.355e-01  4.334e+00  -0.054   0.9567    
## Jumlah.Penduduk                   5.135e-05  1.211e-05   4.239 2.24e-05 ***
## Gini.Ratio                        8.191e-01  2.445e+00   0.335   0.7376    
## Kepadatan                         4.890e-05  4.335e-05   1.128   0.2593    
## KHL                              -2.404e-07  3.732e-07  -0.644   0.5194    
## Proporsi.Lapangan.Kerja.Informal  6.745e-02  1.717e-02   3.929 8.54e-05 ***
## TPT                               1.890e-01  7.943e-02   2.379   0.0173 *  
## IPM                               1.702e-02  4.641e-02   0.367   0.7138    
## WilayahTengah                    -3.769e-01  1.921e-01  -1.962   0.0497 *  
## WilayahTimur                     -6.037e-01  3.591e-01  -1.681   0.0927 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(5.2424) family taken to be 1)
## 
##     Null deviance: 219.864  on 33  degrees of freedom
## Residual deviance:  35.077  on 24  degrees of freedom
## AIC: 474.46
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  5.24 
##           Std. Err.:  1.26 
## 
##  2 x log-likelihood:  -452.465

Beberapa variabel tidak signifikan dan akan dihilangkan dari pembentukan model.

5.3. Binomial negative (hanya variabel yang siginifikan)

model.nb2 <- glm.nb (Penduduk.Miskin~ Jumlah.Penduduk+Proporsi.Lapangan.Kerja.Informal+TPT+Wilayah, data = data.poisson)
summary(model.nb2)

## 
## Call:
## glm.nb(formula = Penduduk.Miskin ~ Jumlah.Penduduk + Proporsi.Lapangan.Kerja.Informal + 
##     TPT + Wilayah, data = data.poisson, init.theta = 4.678079991, 
##     link = log)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       1.476e+00  9.999e-01   1.476  0.13982    
## Jumlah.Penduduk                   5.949e-05  9.035e-06   6.585 4.56e-11 ***
## Proporsi.Lapangan.Kerja.Informal  5.853e-02  1.180e-02   4.961 7.02e-07 ***
## TPT                               1.800e-01  8.316e-02   2.165  0.03039 *  
## WilayahTengah                    -4.168e-01  1.951e-01  -2.136  0.03268 *  
## WilayahTimur                     -8.259e-01  3.136e-01  -2.634  0.00845 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(4.6781) family taken to be 1)
## 
##     Null deviance: 196.429  on 33  degrees of freedom
## Residual deviance:  35.254  on 28  degrees of freedom
## AIC: 470.57
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  4.68 
##           Std. Err.:  1.12 
## 
##  2 x log-likelihood:  -456.571

5.3.1 Test Over dispersi

sim.nb2 <- simulateResiduals(model.nb2, refit = TRUE, n=99)
testOverdispersion(sim.nb2)

## testOverdispersion is deprecated, switch your code to using the testDispersion function

## 
##  DHARMa nonparametric dispersion test via mean deviance residual fitted
##  vs. simulated-refitted
## 
## data:  simulationOutput
## dispersion = 0.89895, p-value = 0.303
## alternative hypothesis: two.sided

Nlai dispersion = 0.8989, mendekati 1 (tidak lagi terjadi overdispersi pada model negatif binomial)

5.3.2. Plot Residual

plotSimulatedResiduals(sim.nb2)

## plotSimulatedResiduals is deprecated, please switch your code to simply using the plot() function

## DHARMa::testOutliers: nBoot > nSim does not make much sense, thus changed to nBoot = nSim. If you want to increase nBoot, increase nSim in DHARMa::simulateResiduals as well.

6. Perbandingan Model

6.1. Perbandingan AIC

Nama.Model <- c("Model.Poisson", "Model.nb", "Model.nb2")
Nilai.AIC <- c(AIC(model.poisson), AIC(model.nb), AIC(model.nb2))
df <- data.frame(Nama.Model, Nilai.AIC)
print(df)

##      Nama.Model Nilai.AIC
## 1 Model.Poisson 3370.7327
## 2      Model.nb  474.4648
## 3     Model.nb2  470.5705

Model negatif binomial dengan seleksi beberapa variabel menunjukan nilai AIC yang lebih kecil

6.2. Plot Prediksi

prediksi.nb <- predict(model.nb, data = data.poisson, type = "response")
prediksi.poisson <- predict(model.poisson, data = data.poisson, type = "response")

plot.prediksi <- data.frame( data.poisson$Penduduk.Miskin, prediksi.nb, prediksi.poisson)

colnames(plot.prediksi) <- c("Penduduk_Miskin", "Prediksi_Negative_Binomial", "Prediksi_Poisson")

plot.prediksi

##    Penduduk_Miskin Prediksi_Negative_Binomial Prediksi_Poisson
## 1              806                  618.14319         542.0508
## 2             1268                  867.59911         942.3193
## 3              335                  885.32820         791.0109
## 4              485                  298.64541         329.0612
## 5              279                  400.22664         440.9737
## 6             1044                  602.09389         562.9618
## 7              297                  512.45751         594.6896
## 8             1002                 1080.70435        1156.9733
## 9               66                  136.23110         233.5222
## 10             151                  146.27898         139.6040
## 11             502                  499.98138         514.0109
## 12            4070                 8155.04949        4761.3663
## 13            3831                 3332.61135        2989.0587
## 14             454                  345.54137         383.2262
## 15            4181                 4593.98018        4366.4813
## 16             814                  696.48920         588.0338
## 17             205                  218.60452         236.5721
## 18             731                  784.76503         744.8363
## 19            1131                  581.12912         779.8757
## 20             350                  402.60861         541.0312
## 21             145                  169.58993         203.6248
## 22             195                  214.80749         278.7523
## 23             236                  140.72886         128.6405
## 24              49                   95.45865         155.3389
## 25             185                  366.60047         312.6241
## 26             388                  378.65151         486.3034
## 27             777                  553.08859         400.2148
## 28             309                  280.98987         299.6829
## 29             185                  250.99591         276.9495
## 30             165                  539.59353         456.2095
## 31             290                  284.75435         269.8443
## 32              79                  235.69352         252.9406
## 33             218                  179.57711         226.5107
## 34             922                  645.74151         759.7043

x <- 1:34
plot(x, plot.prediksi$Penduduk_Miskin, 
     type = "o",
     xlab = "Amatan",
     ylab = "Jumlah Penduduk Miskin",
     main = "Plot Prediksi dan Data Asli"
     )  
lines(x, plot.prediksi$Prediksi_Negative_Binomial, 
      type = "l", 
      col = "red"
      )  
lines(x, plot.prediksi$Prediksi_Poisson, 
      type = "l", 
      col = "green"
      )

7. Interpretasi Koefisien Regresi

data.frame(model.nb2$coefficients, exp(model.nb2$coefficients))

##                                  model.nb2.coefficients
## (Intercept)                                1.476341e+00
## Jumlah.Penduduk                            5.948951e-05
## Proporsi.Lapangan.Kerja.Informal           5.852904e-02
## TPT                                        1.800434e-01
## WilayahTengah                             -4.167909e-01
## WilayahTimur                              -8.259177e-01
##                                  exp.model.nb2.coefficients.
## (Intercept)                                        4.3768997
## Jumlah.Penduduk                                    1.0000595
## Proporsi.Lapangan.Kerja.Informal                   1.0602758
## TPT                                                1.1972693
## WilayahTengah                                      0.6591587
## WilayahTimur                                       0.4378330

Variabel	Exp (coef)	Interpretasi
Jumlah penduduk	1.0000595	Setiap peningkatan 1000 jumlah penduduk, akan meningkatkan jumlah penduduk miskin 1.0000595 kali lipat
Proporsi lapangan kerja informal	1.0602758	Setiap peningkatan 1% proporsi lapangan kerja informal, akan meningkatkan jumlah penduduk miskin 1.0602758 kali lipat.
TPT	1.1972693	Setiap peningkatan 1% tingkat pengangguran terbuka, akan meningkatkan jumlah penduduk miskin 1.1972693 kali lipat.
Wilayah tengah	0.6591587	Peluang daerah yang berada di wilayah tengah untuk mengalami peningkatan penduduk miskin adalah 0.6591587 kali dibanding daerah yang berada di wilayah barat
Wilayah timur	0.4378330	Peluang daerah yang berada di wilayah timur untuk mengalami peningkatan penduduk miskin adalah 0.437833 kali dibanding daerah yang berada di wilayah barat

8. Prediksi data baru

No	Jumlah.Penduduk	Proporsi.Lapangan.Kerja.Informal	TPT	Wilayah
1	14798,4	59,07	4,7	Barat
2	6951,2	59,98	4,31	Barat
3	1469,8	67,54	8,02	Barat
4	10576,4	49,63	8,35	Barat
5	34738,2	36,88	3,73	Barat
6	3919,2	63,86	4,84	Tengah
7	2686,3	53,43	3,3	Tengah
8	3664,7	58,77	4,2	Tengah
9	2512,9	44,26	4,62	Tengah
10	8888,8	63,55	6,51	Timur

data.baru <- read_xlsx("Data Poisson.xlsx", sheet = "Data Kemiskinan Baru")
data.baru$Wilayah <- as.factor(data.baru$Wilayah)
data.baru

## # A tibble: 10 × 4
##    Jumlah.Penduduk Proporsi.Lapangan.Kerja.Informal   TPT Wilayah
##              <dbl>                            <dbl> <dbl> <fct>  
##  1          14798.                             59.1  4.7  Barat  
##  2           6951.                             60.0  4.31 Barat  
##  3           1470.                             67.5  8.02 Barat  
##  4          10576.                             49.6  8.35 Barat  
##  5          34738.                             36.9  3.73 Barat  
##  6           3919.                             63.9  4.84 Tengah 
##  7           2686.                             53.4  3.3  Tengah 
##  8           3665.                             58.8  4.2  Tengah 
##  9           2513.                             44.3  4.62 Tengah 
## 10           8889.                             63.6  6.51 Timur

data.frame(predict(model.nb2, newdata = data.baru, type = "response"))

##    predict.model.nb2..newdata...data.baru..type....response..
## 1                                                    780.7106
## 2                                                    481.2674
## 3                                                   1054.4532
## 4                                                    674.3060
## 5                                                    585.8197
## 6                                                    365.6862
## 7                                                    139.8690
## 8                                                    238.2916
## 9                                                    102.6512
## 10                                                   433.0340

Regresi Poisson Data Kemiskinan

Mochamad Ridwan

2023-05-21