Poisson Non Homogen

LIBRARY

library(readxl)
library(AER)

## Loading required package: car

## Loading required package: carData

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Loading required package: survival

library(MASS)

LOAD DATASET

data <- read_excel("DATASET BULANAN.xlsx")
data$t <- 1:nrow(data)

str(data)

## tibble [61 × 4] (S3: tbl_df/tbl/data.frame)
##  $ Bulan    : chr [1:61] "01-2019" "02-2019" "03-2019" "04-2019" ...
##  $ Frekuensi: num [1:61] 1941 2113 2058 1688 1807 ...
##  $ Besaran  : num [1:61] 2.21e+10 2.40e+10 2.11e+10 1.84e+10 1.82e+10 ...
##  $ t        : int [1:61] 1 2 3 4 5 6 7 8 9 10 ...

summary(data)

##     Bulan             Frekuensi       Besaran                t     
##  Length:61          Min.   :  34   Min.   :8.939e+08   Min.   : 1  
##  Class :character   1st Qu.: 216   1st Qu.:6.988e+09   1st Qu.:16  
##  Mode  :character   Median : 484   Median :1.334e+10   Median :31  
##                     Mean   :1006   Mean   :1.889e+10   Mean   :31  
##                     3rd Qu.:1835   3rd Qu.:2.317e+10   3rd Qu.:46  
##                     Max.   :3277   Max.   :1.615e+11   Max.   :61

STATISTIK DESKRIPTIF

mean(data$Frekuensi)

## [1] 1006.41

var(data$Frekuensi)

## [1] 770571.6

var(data$Frekuensi)/mean(data$Frekuensi)

## [1] 765.6639

VISUALISASI DATA

par(mfrow=c(2,2))
plot(data$t, data$Frekuensi, type = "l", col = "blue",
     main = "Frekuensi Klaim Bulanan",
     xlab = "Waktu (Bulan)", ylab = "Frekuensi")
points(data$t, data$Frekuensi, pch = 19, col = "blue")

plot(data$t, data$Besaran, type = "l", col = "red",
     main = "Total Besaran Klaim Bulanan",
     xlab = "Waktu (Bulan)", ylab = "Besaran (Rp)")
points(data$t, data$Besaran, pch = 19, col = "red")

hist(data$Frekuensi, breaks = 20, col = "lightblue",
     main = "Distribusi Frekuensi Klaim",
     xlab = "Frekuensi")

hist(data$Besaran, breaks = 20, col = "lightcoral",
     main = "Distribusi Besaran Klaim",
     xlab = "Besaran (Rp)")

par(mfrow=c(1,1))

PEMODELAN POISSON: HOMOGEN VS NON-HOMOGEN

model_homogen <- glm(Frekuensi ~ 1, family = poisson, data = data)
model_linear <- glm(Frekuensi ~ t, family = poisson, data = data)
model_quad <- glm(Frekuensi ~ t + I(t^2), family = poisson, data = data)

Model Homogen (λ konstan)

summary(model_homogen)

## 
## Call:
## glm(formula = Frekuensi ~ 1, family = poisson, data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 6.914145   0.004036    1713   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 47770  on 60  degrees of freedom
## Residual deviance: 47770  on 60  degrees of freedom
## AIC: 48273
## 
## Number of Fisher Scoring iterations: 5

Model Non-Homogen Linear (λ(t) = exp(β₀ + β₁t))

summary(model_linear)

## 
## Call:
## glm(formula = Frekuensi ~ t, family = poisson, data = data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.7444087  0.0067789  1142.4   <2e-16 ***
## t           -0.0316419  0.0002504  -126.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 47770  on 60  degrees of freedom
## Residual deviance: 30332  on 59  degrees of freedom
## AIC: 30837
## 
## Number of Fisher Scoring iterations: 5

Model Non-Homogen Kuadratik (λ(t) = exp(β₀ + β₁t + β₂t²))

summary(model_quad)

## 
## Call:
## glm(formula = Frekuensi ~ t + I(t^2), family = poisson, data = data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.8950563  0.0093752  842.12   <2e-16 ***
## t           -0.0502531  0.0008685  -57.86   <2e-16 ***
## I(t^2)       0.0003414  0.0000152   22.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 47770  on 60  degrees of freedom
## Residual deviance: 29838  on 58  degrees of freedom
## AIC: 30345
## 
## Number of Fisher Scoring iterations: 5

UJI OVERDISPERSION

dispersiontest(model_homogen)

## 
##  Overdispersion test
## 
## data:  model_homogen
## z = 7.9915, p-value = 6.666e-16
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##    753.112

LIKELIHOOD RATIO TEST

Homogen vs Linear

anova(model_homogen, model_linear, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: Frekuensi ~ 1
## Model 2: Frekuensi ~ t
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        60      47770                          
## 2        59      30332  1    17438 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Linear vs Kuadratik

anova(model_linear, model_quad, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: Frekuensi ~ t
## Model 2: Frekuensi ~ t + I(t^2)
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        59      30332                          
## 2        58      29838  1   493.54 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

KRITERIA PEMILIHAN MODEL

data.frame(
  Model = c("Homogen", "Linear", "Kuadratik"),
  AIC = c(AIC(model_homogen), AIC(model_linear), AIC(model_quad)),
  BIC = c(BIC(model_homogen), BIC(model_linear), BIC(model_quad)),
  Deviance = c(deviance(model_homogen), deviance(model_linear), deviance(model_quad))
)

##       Model      AIC      BIC Deviance
## 1   Homogen 48273.46 48275.57 47770.14
## 2    Linear 30837.01 30841.24 30331.69
## 3 Kuadratik 30345.48 30351.81 29838.16

VISUALISASI PERBANDINGAN MODEL

data$fitted_homogen <- fitted(model_homogen)
data$fitted_linear <- fitted(model_linear)
data$fitted_quad <- fitted(model_quad)

plot(data$t, data$Frekuensi, pch = 19, col = "black",
     main = "Perbandingan Model Poisson",
     xlab = "Waktu (Bulan)", ylab = "Frekuensi",
     ylim = range(c(data$Frekuensi, data$fitted_quad)))
lines(data$t, data$fitted_homogen, col = "green", lwd = 2, lty = 2)
lines(data$t, data$fitted_linear, col = "blue", lwd = 2)
lines(data$t, data$fitted_quad, col = "red", lwd = 2)
legend("topright", 
       legend = c("Data", "Homogen", "Linear", "Kuadratik"),
       col = c("black", "green", "blue", "red"),
       pch = c(19, NA, NA, NA),
       lty = c(NA, 2, 1, 1), lwd = 2)

# KOEFISIEN MODEL TERPILIH ## Model Linear

coef(model_linear)

## (Intercept)           t 
##  7.74440866 -0.03164189

Model Kuadratik

coef(model_quad)

##   (Intercept)             t        I(t^2) 
##  7.8950562845 -0.0502530519  0.0003414362

FUNGSI INTENSITAS λ(t)

plot(data$t, fitted(model_quad), type = "l", col = "darkred", lwd = 2,
     main = "Fungsi Intensitas λ(t) - Model Kuadratik",
     xlab = "Waktu (Bulan)", ylab = "λ(t)")
grid()

# DIAGNOSTIK MODEL

par(mfrow=c(2,2))
plot(fitted(model_quad), residuals(model_quad, type = "deviance"),
     xlab = "Fitted Values", ylab = "Deviance Residuals",
     main = "Residual vs Fitted")
abline(h = 0, col = "red", lty = 2)

qqnorm(residuals(model_quad, type = "deviance"))
qqline(residuals(model_quad, type = "deviance"), col = "red")

plot(fitted(model_quad), sqrt(abs(residuals(model_quad, type = "deviance"))),
     xlab = "Fitted Values", ylab = "√|Deviance Residuals|",
     main = "Scale-Location")

plot(cooks.distance(model_quad), type = "h",
     main = "Cook's Distance", xlab = "Observation")
abline(h = 4/nrow(data), col = "red", lty = 2)

par(mfrow=c(1,1))

PREDIKSI 12 BULAN KE DEPAN

n_pred <- 12
new_data <- data.frame(t = (max(data$t) + 1):(max(data$t) + n_pred))

pred <- predict(model_quad, newdata = new_data, type = "response", se.fit = TRUE)

pred_df <- data.frame(
  Bulan = new_data$t,
  Prediksi = pred$fit,
  SE = pred$se.fit,
  Lower = pred$fit - 1.96 * pred$se.fit,
  Upper = pred$fit + 1.96 * pred$se.fit
)

pred_df

##    Bulan Prediksi        SE    Lower    Upper
## 1     62 442.2378  7.498071 427.5416 456.9340
## 2     63 438.9011  7.872309 423.4714 454.3308
## 3     64 435.8872  8.262211 419.6933 452.0811
## 4     65 433.1897  8.668022 416.2004 450.1790
## 5     66 430.8029  9.090080 412.9864 448.6195
## 6     67 428.7220  9.528817 410.0455 447.3985
## 7     68 426.9426  9.984749 407.3725 446.5127
## 8     69 425.4610 10.458477 404.9624 445.9596
## 9     70 424.2742 10.950685 402.8108 445.7375
## 10    71 423.3797 11.462134 400.9139 445.8454
## 11    72 422.7756 11.993667 399.2680 446.2832
## 12    73 422.4609 12.546208 397.8703 447.0514

VISUALISASI PREDIKSI

plot(data$t, data$Frekuensi, pch = 19, col = "black",
     xlim = c(1, max(new_data$t)),
     ylim = range(c(data$Frekuensi, pred_df$Upper)),
     main = "Prediksi Frekuensi Klaim (12 Bulan)",
     xlab = "Waktu (Bulan)", ylab = "Frekuensi")
lines(data$t, fitted(model_quad), col = "blue", lwd = 2)
lines(pred_df$Bulan, pred_df$Prediksi, col = "red", lwd = 2, lty = 2)
lines(pred_df$Bulan, pred_df$Lower, col = "red", lwd = 1, lty = 3)
lines(pred_df$Bulan, pred_df$Upper, col = "red", lwd = 1, lty = 3)
polygon(c(pred_df$Bulan, rev(pred_df$Bulan)),
        c(pred_df$Lower, rev(pred_df$Upper)),
        col = rgb(1, 0, 0, 0.2), border = NA)
legend("topleft", 
       legend = c("Data", "Fitted", "Prediksi", "95% CI"),
       col = c("black", "blue", "red", "red"),
       pch = c(19, NA, NA, NA),
       lty = c(NA, 1, 2, 3), lwd = 2)

# MODEL NEGATIVE BINOMIAL

nb_model <- glm.nb(Frekuensi ~ t + I(t^2), data = data)
summary(nb_model)

## 
## Call:
## glm.nb(formula = Frekuensi ~ t + I(t^2), data = data, init.theta = 1.443907116, 
##     link = log)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  8.0770472  0.3305842  24.433  < 2e-16 ***
## t           -0.0688446  0.0246103  -2.797  0.00515 ** 
## I(t^2)       0.0006475  0.0003848   1.683  0.09242 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.4439) family taken to be 1)
## 
##     Null deviance: 93.808  on 60  degrees of freedom
## Residual deviance: 67.724  on 58  degrees of freedom
## AIC: 951.01
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.444 
##           Std. Err.:  0.238 
## 
##  2 x log-likelihood:  -943.011

PERBANDINGAN AIC: POISSON VS NEGATIVE BINOMIAL

data.frame(
  Model = c("Poisson", "Negative Binomial"),
  AIC = c(AIC(model_quad), AIC(nb_model))
)

##               Model        AIC
## 1           Poisson 30345.4778
## 2 Negative Binomial   951.0115

VISUALISASI PERBANDINGAN

data$fitted_nb <- fitted(nb_model)
plot(data$t, data$Frekuensi, pch = 19, col = "black",
     main = "Poisson vs Negative Binomial",
     xlab = "Waktu (Bulan)", ylab = "Frekuensi")
lines(data$t, fitted(model_quad), col = "blue", lwd = 2)
lines(data$t, data$fitted_nb, col = "purple", lwd = 2, lty = 2)
legend("topright", 
       legend = c("Data", "Poisson", "Negative Binomial"),
       col = c("black", "blue", "purple"),
       pch = c(19, NA, NA),
       lty = c(NA, 1, 2), lwd = 2)