Dane wejściowe mają format pliku xlsx i zawierają 8 arkuszy: arkusz z monitoring sites coordinates oraz 7 arkuszy nazwanych od nazwy zmiennej zależnej, zawierających wartości kolejnych zmiennych objaśniających w kolumnach.

KROK 1: wczytanie danych i opis wizualny

Sekcja skopiowana bezpośrednio z markdown, którym realizowałam zajęcia z ekonometrii przestrzennej; służy wizualnemu opisowi danych.

Pakiety

pkgs = c(
  "readxl",
  "dplyr",
  "stringr",
  "ggplot2",
  "knitr",
  "sf",
  "rnaturalearth"
)
invisible(lapply(pkgs, library, character.only = TRUE))

Ustawienia kolorów i motywu wykresów

# kolorki
COL_BG      = "#F7F7FB"
COL_DARK    = "#2E3440"
COL_GREY    = "#D8DEE9"
COL_PURPLE_DARK = "#3B0F70"
COL_PURPLE      = "#8C2981"
COL_PINK        = "#DE4968"
COL_PEACH       = "#FE9F6D"
COL_YELLOW      = "#FCFDBF"
PAL_CONT = c(
  COL_PURPLE_DARK,
  COL_PURPLE,
  COL_PINK,
  COL_PEACH,
  COL_YELLOW
)
SCALE_CONT_FILL = scale_fill_gradientn(colours = PAL_CONT, name = NULL)
SCALE_CONT_COL  = scale_color_gradientn(colours = PAL_CONT, name = NULL)
pal_discrete = function(n) {
  base = c(COL_PURPLE_DARK, COL_PURPLE, COL_PINK, COL_PEACH, COL_YELLOW)
  if (n <= length(base)) return(base[1:n])
  grDevices::colorRampPalette(base)(n)
}
theme_spojny = theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", color = COL_DARK),
    plot.subtitle = element_text(color = COL_DARK),
    legend.title = element_text(face = "bold", color = COL_DARK),
    legend.text  = element_text(color = COL_DARK),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey92"),
    strip.text = element_text(face = "bold", color = COL_DARK)
  )

Wczytanie arkuszy

PATH = "C:/Users/majed/OneDrive/Pulpit/smieci.xlsx"
sheet_coords = "Site coordinates"
sheets_all   = readxl::excel_sheets(PATH)
sheets_cat   = sheets_all[sheets_all != sheet_coords]

coords = readxl::read_excel(PATH, sheet = sheet_coords) %>%
  dplyr::rename(site_id = `Monitoring site ID`) %>%
  dplyr::mutate(
    site_id = as.integer(site_id),
    lat = as.numeric(lat),
    lon = as.numeric(lon)
  )

litter_list = list()
for (sh in sheets_cat) {
  df = readxl::read_excel(PATH, sheet = sh)
  y_col = names(df)[stringr::str_detect(names(df), "^Y_")]
  
  df2 = df %>%
    dplyr::rename(site_id = `Monitoring site ID`, y = dplyr::all_of(y_col)) %>%
    dplyr::mutate(
      site_id = as.integer(site_id),
      y = as.numeric(y),
      category_sheet = sh,
      category = sh %>%
        stringr::str_remove("^\\d+_") %>%
        stringr::str_replace_all("_", " ") %>%
        stringr::str_trim()
    ) %>%
    dplyr::left_join(coords, by = "site_id")
  
  litter_list[[sh]] = df2
}
litter_long = dplyr::bind_rows(litter_list)

Statystyki opisowe

tab_summary = litter_long %>%
  dplyr::group_by(category) %>%
  dplyr::summarise(
    n_sites   = sum(!is.na(y)),
    total     = sum(y, na.rm = TRUE),
    mean      = mean(y, na.rm = TRUE),
    median    = median(y, na.rm = TRUE),
    sd        = sd(y, na.rm = TRUE),
    min       = min(y, na.rm = TRUE),
    max       = max(y, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  dplyr::arrange(desc(mean))


knitr::kable(
  tab_summary,
  digits = 2,
  col.names = c("Typ śmieci", "Liczba obszarów monitoringu", "Suma",
                "Średnia", "Mediana", "SD", "Min", "Max"),
  caption = "Statystyki opisowe"
)
Statystyki opisowe
Typ śmieci Liczba obszarów monitoringu Suma Średnia Mediana SD Min Max
Cigarette butts 44 4783 108.70 67.5 114.62 2 514
Plastic ropes 44 588 13.36 8.5 14.53 1 70
Styrofoam and polystyrene 44 506 11.50 6.0 16.81 0 103
Plastic bags 44 450 10.23 7.0 9.35 0 36
Plastic bottle caps 44 351 7.98 4.0 11.85 0 66
SUP dishes and cutlery 44 170 3.86 3.0 4.04 0 19
Beverage containers 44 66 1.50 1.0 1.77 0 7

Rozmieszczenie przestrzenne

latvia_sf_4326 = rnaturalearth::ne_countries(
  country = "Latvia", scale = "medium", returnclass = "sf"
) %>% sf::st_make_valid() %>% sf::st_transform(4326)
litter_sf_4326 = litter_long %>%
  dplyr::filter(!is.na(lat), !is.na(lon)) %>%
  sf::st_as_sf(coords = c("lon", "lat"), crs = 4326)
ggplot() +
  geom_sf(
    data = latvia_sf_4326,
    fill = COL_BG,
    color = COL_DARK,
    linewidth = 0.35
  ) +
  geom_sf(
    data = litter_sf_4326,
    aes(size = y, color = y),
    alpha = 0.8
  ) +
  facet_wrap(~ category) +
  scale_size_continuous(
    name = "Liczba jednostek",
    range = c(1.2, 6)
  ) +
  scale_color_gradientn(
    colours = PAL_CONT,
    name = "Liczba jednostek",
    trans = "sqrt"
  ) +
  labs(
    title = "Rozmieszczenie przestrzenne obszarów monitoringu i jednostek śmieci"
  ) +
  theme_spojny +
  theme(
    legend.position = "bottom"
  )

#Rozkład zmiennych ciągłych przed i po zlogarytmowaniu

res = as.numeric(litter_list[[1]][[14]])
tur = as.numeric(litter_list[[1]][[15]])

hist_ciagle = rbind(
  data.frame(zmienna = "LocalResidents", wartosc = res),
  data.frame(zmienna = "ln(LocalResidents)",     wartosc = log(res)),
  data.frame(zmienna = "TouristNights",  wartosc = tur),
  data.frame(zmienna = "ln(TouristNights)",      wartosc = log(tur))
)
hist_ciagle$zmienna = factor(hist_ciagle$zmienna, levels = c(
  "LocalResidents", "ln(LocalResidents)",
  "TouristNights",  "ln(TouristNights)"))

ggplot(hist_ciagle, aes(x = wartosc)) +
  geom_histogram(bins = 15, fill = "lightblue", color = "white", linewidth = 0.5) +
  facet_wrap(~ zmienna, scales = "free", ncol = 2) +
  labs(
    title = "Rozkład zmiennych ciągłych przed i po logarytmizacji"
  ) +
  theme_spojny

KROK 2: kod właściwy (modele)

Estymowane modele: Poisson, NB oraz ich zero-inflated odpowiedniki (dla niektórych typów śmieci udział obserwacji zerowych wynosi około 30%).

Pakiety

library("readxl")
library("MASS")
library("pscl")
library("stargazer")
library("lmtest")
library("systemfit")

Przygotowanie danych

Każda kolumna ze zmienną objaśniającą ma nazewnictwo na wzór X1_"nazwa zmiennej". Dla wygody kodowania modeli skracam nazwy do X1-X14, a pełne etykiety wrzucam na wydrukach

# etykietki dla kolumn - lekko skracam nazwy zeby miescily sie na wydrukach
nazwy_litter = c("Cigarette buds","Plastic bags","Bottle caps",
                 "Beverage cont.","SUP","Plastic ropes","Styrofoam")

litter_data = list()
for (i in 1:7) {
  dane = as.data.frame(litter_list[[i]])[, 2:16]
  names(dane) = c("y","X1","X2","X3","X4","X5","X6","X7","X8","X9",
                  "X10","X11","X12","X13","X14")
  dane = na.omit(dane)
  dane$lnX12 = log(dane$X12)    # liczba mieszkancow - logarytm
  dane$lnX13 = log(dane$X13)    # noclegi turystyczne - logarytm
  litter_data[[i]] = dane
}

# etykietki pod wydruki
etykiety = c("BathingZoneManag","BlueFlag","GarbageBin","BeachCleanup",
             "BeachCafe","CoastCafe","Accommod","Parking","Shops",
             "PublTransport","BeachUseCateg","ln(LocalResidents)",
             "ln(TouristNights2024)","LVSeaBasin")

# dane po arkuszach i typach smieci
d1 = litter_data[[1]]   # Cigarette butts
d2 = litter_data[[2]]   # Plastic bags
d3 = litter_data[[3]]   # Plastic bottle caps
d4 = litter_data[[4]]   # Beverage containers
d5 = litter_data[[5]]   # SUP dishes and cutlery
d6 = litter_data[[6]]   # Plastic ropes
d7 = litter_data[[7]]   # Styrofoam and polystyrene

Testy Multikolinearności

Macierz korelacji zmiennych objaśniających

library(ggcorrplot)

X_df = d1[, c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10",
              "X11","lnX12","lnX13","X14")]

names(X_df) = etykiety

corr_mat = cor(X_df, use = "pairwise.complete.obs")

ggcorrplot(
  corr_mat, type = "lower", lab = TRUE, lab_size = 2.6,
  colors = c(COL_PURPLE_DARK, "white", COL_PINK), outline.color = "white"
) +
  labs(title = "Macierz korelacji X") +
  theme_spojny +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

VIF

library(car)

vif_mod = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14,
              family = poisson, data = d1)

vif_tab = data.frame(
  Zmienna = etykiety,
  VIF = round(as.numeric(car::vif(vif_mod)), 2)
) %>%
  dplyr::arrange(desc(VIF))

knitr::kable(
  vif_tab,
  caption = "VIF"
)
VIF
Zmienna VIF
BeachUseCateg 7.90
ln(LocalResidents) 7.45
ln(TouristNights2024) 7.40
BeachCleanup 6.07
BlueFlag 4.38
BathingZoneManag 3.91
CoastCafe 3.17
BeachCafe 2.97
PublTransport 2.81
Accommod 2.64
GarbageBin 2.37
Shops 1.91
LVSeaBasin 1.82
Parking 1.41

Poisson

pois1 = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, family=poisson, data=d1)
pois2 = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, family=poisson, data=d2)
pois3 = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, family=poisson, data=d3)
pois4 = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, family=poisson, data=d4)
pois5 = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, family=poisson, data=d5)
pois6 = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, family=poisson, data=d6)
pois7 = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, family=poisson, data=d7)

stargazer(pois1, pois2, pois3, pois4, pois5, pois6, pois7,
          type="html", column.labels=nazwy_litter, model.numbers=FALSE,
          covariate.labels=etykiety)
Dependent variable:
y
Cigarette buds Plastic bags Bottle caps Beverage cont. SUP Plastic ropes Styrofoam
BathingZoneManag 0.384*** -0.307* -0.349* 0.646 0.789*** -0.300* -0.516***
(0.060) (0.186) (0.204) (0.430) (0.264) (0.160) (0.186)
BlueFlag -1.147*** -1.290*** -1.514*** -1.302* -0.095 -0.240 0.215
(0.075) (0.257) (0.293) (0.787) (0.350) (0.246) (0.258)
GarbageBin 1.477*** 1.003*** 1.696*** -0.904* 0.745*** 0.198 -0.272
(0.055) (0.167) (0.195) (0.467) (0.270) (0.127) (0.175)
BeachCleanup -0.110* -0.239 0.467** -0.012 -0.412* -0.384*** 0.576***
(0.056) (0.157) (0.186) (0.319) (0.233) (0.104) (0.125)
BeachCafe 0.744*** 1.345*** 1.013*** 1.429* 0.502* 1.124*** 1.058***
(0.055) (0.207) (0.265) (0.793) (0.304) (0.233) (0.252)
CoastCafe 0.033 -0.590*** -0.773*** -0.664* 0.399 -0.106 0.575***
(0.053) (0.158) (0.204) (0.391) (0.246) (0.129) (0.162)
Accommod -0.204*** -0.537*** -0.600*** 1.410*** -0.001 0.411*** -0.874***
(0.051) (0.147) (0.172) (0.497) (0.247) (0.126) (0.137)
Parking 0.357*** 0.544** -0.226 -0.302 0.896** 0.822*** -0.671***
(0.076) (0.220) (0.216) (0.589) (0.385) (0.187) (0.208)
Shops -0.177*** -0.184 0.441** -0.769** 0.447** -0.857*** 0.644***
(0.040) (0.136) (0.174) (0.380) (0.227) (0.119) (0.137)
PublTransport 0.486*** 0.435*** 0.483** 0.642 -0.530* 0.164 -0.413***
(0.052) (0.166) (0.211) (0.435) (0.286) (0.137) (0.159)
BeachUseCateg -0.308*** 0.541*** -0.271 0.693 -1.175*** 0.218 0.541**
(0.069) (0.195) (0.211) (0.578) (0.323) (0.173) (0.211)
ln(LocalResidents) 0.185*** -0.034 -0.182** -0.079 0.256** 0.073 -0.482***
(0.025) (0.073) (0.089) (0.199) (0.125) (0.061) (0.071)
ln(TouristNights2024) -0.087** -0.006 0.109 -0.783** 0.258 -0.294*** 0.019
(0.039) (0.117) (0.143) (0.378) (0.182) (0.103) (0.116)
LVSeaBasin 0.061 -0.277** -1.022*** -0.049 -0.138 0.209* 0.092
(0.044) (0.130) (0.159) (0.308) (0.192) (0.108) (0.126)
Constant 3.403*** 1.972* 3.255** 7.784** -1.815 4.316*** 4.511***
(0.332) (1.073) (1.360) (3.275) (1.618) (0.933) (1.076)
Observations 44 44 44 44 44 44 44
Log Likelihood -639.646 -152.947 -160.862 -66.099 -102.788 -241.218 -220.566
Akaike Inf. Crit. 1,309.292 335.894 351.725 162.197 235.576 512.437 471.133
Note: p<0.1; p<0.05; p<0.01

NB

nb1 = glm.nb(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, data=d1)
nb2 = glm.nb(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, data=d2)
nb3 = glm.nb(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, data=d3)
nb4 = glm.nb(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, data=d4)
nb5 = glm.nb(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, data=d5)
nb6 = glm.nb(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, data=d6)
nb7 = glm.nb(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, data=d7)

stargazer(nb1, nb2, nb3, nb4, nb5, nb6, nb7,
          type="html", column.labels=nazwy_litter, model.numbers=FALSE,
          covariate.labels=etykiety)
Dependent variable:
y
Cigarette buds Plastic bags Bottle caps Beverage cont. SUP Plastic ropes Styrofoam
BathingZoneManag 0.565* -0.050 0.264 0.651 0.759** -0.016 -0.415
(0.298) (0.331) (0.418) (0.510) (0.376) (0.364) (0.371)
BlueFlag -1.131*** -1.278** -1.694** -1.317 -0.434 -0.288 -0.008
(0.429) (0.521) (0.665) (0.887) (0.545) (0.537) (0.538)
GarbageBin 1.379*** 1.063*** 1.563*** -0.923* 0.691* 0.122 -0.427
(0.282) (0.313) (0.401) (0.542) (0.368) (0.337) (0.356)
BeachCleanup -0.071 -0.355 -0.100 -0.030 -0.420 -0.451* 0.405
(0.230) (0.270) (0.350) (0.380) (0.310) (0.269) (0.273)
BeachCafe 0.779* 1.293*** 0.978 1.488 0.636 0.960* 1.025*
(0.438) (0.482) (0.617) (0.907) (0.514) (0.546) (0.549)
CoastCafe -0.041 -0.430 -0.561 -0.614 0.241 0.138 0.469
(0.263) (0.285) (0.375) (0.461) (0.339) (0.317) (0.321)
Accommod -0.150 -0.647** -0.469 1.356** 0.088 0.525 -1.158***
(0.267) (0.288) (0.355) (0.558) (0.342) (0.323) (0.320)
Parking 0.492 0.581 -0.055 -0.450 0.812 0.891** -0.323
(0.372) (0.405) (0.489) (0.687) (0.501) (0.450) (0.461)
Shops -0.376 -0.012 0.308 -0.795* 0.374 -0.712** 0.212
(0.262) (0.277) (0.363) (0.472) (0.329) (0.316) (0.323)
PublTransport 0.394 0.480 0.275 0.688 -0.512 -0.018 -0.144
(0.298) (0.327) (0.427) (0.530) (0.395) (0.355) (0.358)
BeachUseCateg -0.364 0.324 -0.240 0.741 -1.038** -0.048 0.855*
(0.356) (0.385) (0.484) (0.674) (0.458) (0.428) (0.443)
ln(LocalResidents) 0.221* -0.002 0.096 -0.112 0.294* 0.229 -0.424***
(0.129) (0.139) (0.177) (0.232) (0.171) (0.155) (0.155)
ln(TouristNights2024) 0.014 -0.053 -0.147 -0.756* 0.190 -0.506** 0.024
(0.202) (0.220) (0.279) (0.423) (0.257) (0.246) (0.243)
LVSeaBasin 0.422* -0.208 -1.119*** -0.065 -0.001 0.093 -0.242
(0.217) (0.246) (0.320) (0.370) (0.278) (0.262) (0.267)
Constant 1.384 2.576 4.760* 7.838** -1.671 6.015*** 4.243*
(1.851) (2.024) (2.600) (3.747) (2.330) (2.256) (2.270)
Observations 44 44 44 44 44 44 44
Log Likelihood -223.086 -131.796 -120.435 -65.871 -97.892 -149.346 -137.706
theta 2.901*** (0.648) 3.426*** (1.116) 2.078*** (0.624) 3.890 (3.269) 3.594** (1.635) 2.299*** (0.567) 2.421*** (0.609)
Akaike Inf. Crit. 476.173 293.592 270.870 161.743 225.783 328.692 305.412
Note: p<0.1; p<0.05; p<0.01

Testowanie: Poisson vs NB

H0: brak naddyspersji. Odrzucenie H0 -> NB.

modele_nb   = list(nb1, nb2, nb3, nb4, nb5, nb6, nb7)
modele_pois = list(pois1, pois2, pois3, pois4, pois5, pois6, pois7)

lr_tab = data.frame()
for (i in 1:7) {
  test = lrtest(modele_nb[[i]], modele_pois[[i]])
  p = test[["Pr(>Chisq)"]][2]
  lr_tab = rbind(lr_tab, data.frame(
    litter      = nazwy_litter[i],
    logLik_NB   = round(test[["LogLik"]][1], 3),
    logLik_Pois = round(test[["LogLik"]][2], 3),
    Chisq       = round(test[["Chisq"]][2], 3),
    df          = abs(test[["Df"]][2]),
    p_value     = format.pval(p, digits = 3, eps = 1e-16),
    result       = ifelse(p < 0.05, "NB", "Poisson")
  ))
}

knitr::kable(
  lr_tab,
  col.names = c("Typ śmieci", "logLik NB", "logLik Poisson",
                "Chi-sq.", "df", "p-val", ""),
  caption = "Test ilorazu wiarygodności: Poisson vs NB"
)
Test ilorazu wiarygodności: Poisson vs NB
Typ śmieci logLik NB logLik Poisson Chi-sq. df p-val
Cigarette buds -222.086 -639.646 835.120 1 <1e-16 NB
Plastic bags -130.796 -152.947 44.302 1 2.81e-11 NB
Bottle caps -119.435 -160.862 82.855 1 <1e-16 NB
Beverage cont. -64.871 -66.099 2.454 1 0.117 Poisson
SUP -96.892 -102.788 11.793 1 0.000595 NB
Plastic ropes -148.346 -241.218 185.745 1 <1e-16 NB
Styrofoam -136.706 -220.566 167.721 1 <1e-16 NB

Dla wszystkich zmiennych zależnych z wyjątkiem Beverage containers wygrywa model ujemny dwumianowy. Beverage containers jest dość specyficzny - aż 39% obserwacji zerowych.

Coef plot dla NB (6/7) i Poissona (Beverage Containers)

library(broom)

#jaki model jakie y
modele    = list(nb1, nb2, nb3, pois4, nb5, nb6, nb7)
zmienne_x = c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10",
              "X11","lnX12","lnX13","X14")

# coef + ci extrakt z każdego modelu
coefy = data.frame()
for (i in 1:7) {
  est = broom::tidy(modele[[i]], conf.int = TRUE)
  est = est[est$term %in% zmienne_x, ]
  est$litter = nazwy_litter[i]
  coefy = rbind(coefy, est)
}

# etykietki zmiennych, istotność 5%
coefy$zmienna   = factor(etykiety[match(coefy$term, zmienne_x)], levels = rev(etykiety))
coefy$litter    = factor(coefy$litter, levels = nazwy_litter)
coefy$istotnosc = ifelse(coefy$p.value < 0.05, "istotne (5%)", "nieistotne")

ggplot(coefy, aes(x = estimate, y = zmienna, color = istotnosc)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = COL_DARK) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.25) +
  geom_point(size = 2) +
  facet_wrap(~ litter, scales = "free_x", ncol = 4) +
  scale_color_manual(
    values = c("istotne (5%)" = COL_PURPLE, "nieistotne" = COL_PEACH),
    name = NULL
  ) +
  labs(x = NULL, y = NULL) +
  theme_spojny +
  theme(legend.position = "bottom")

Fitted vs Observed

pred_df = data.frame()
for (i in 1:7) {
  m = modele[[i]]
  pred_df = rbind(pred_df, data.frame(
    litter      = nazwy_litter[i],
    obserwowane = m$y,
    dopasowane  = fitted(m)
  ))
}
pred_df$litter = factor(pred_df$litter, levels = nazwy_litter)

ggplot(pred_df, aes(x = dopasowane, y = obserwowane)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = COL_DARK) +
  geom_point(color = COL_PURPLE, alpha = 0.7) +
  facet_wrap(~ litter, scales = "free", ncol = 4) +
  labs(title = "Fitted vs Observed",
       x = "Fitted", y = "Observed") +
  theme_spojny

Czy modele zero-inflated mają sens?

dla beverage containers - porównujemy z poissonem

  1. Zero test (H0: Poisson; H1: ZIP) (https://search.r-project.org/CRAN/refmans/vcdExtra/html/zero.test.html)
library(vcdExtra) 

zt_tab = data.frame()

for (i in 4) {
  invisible(capture.output(zt <- zero.test(litter_data[[4]]$y)))
  zt_tab = rbind(zt_tab, data.frame(
    litter      = nazwy_litter[i],
    chi2        = round(zt$statistic, 2),
    df          = zt$df,
    p_value     = zt$prob
  ))
}

knitr::kable(
  zt_tab,
  col.names = c("Typ śmieci", "Chi-sq.", "df", "p-value"),
  caption = "Test van den Broeka: zero inflation względem rozkładu Poissona"
)
Test van den Broeka: zero inflation względem rozkładu Poissona
Typ śmieci Chi-sq. df p-value
Beverage cont. 11.88 1 0.0005665

zero inflated poisson sugerowany

  1. Porównanie zer obserwowanych i oczekiwanych wg Poissona (https://r-statistics.co/Zero-Inflated-Distributions-in-R.html#:~:text=The%20data%20has%20308%20zeros%2C%20but%20a%20plain,hurdle%20model%20rather%20than%20a%20vanilla%20Poisson%20fit.)
zi_test_pois = data.frame()
for (i in 4) {
  dane_test = litter_data[[i]]
  pois = glm(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14,
             family = poisson, data = dane_test)
  zera_obs = sum(dane_test$y == 0)                       # ile zer jest
  zera_exp = sum(dpois(0, fitted(pois)))                 # zera oczekiwane wg Poissona
  zi_test_pois = rbind(zi_test_pois, data.frame(
    litter   = nazwy_litter[i],
    zera_obs = round(zera_obs, 3),
    proc_zer = round(100 * zera_obs / nrow(dane_test), 3),
    zera_exp = round(zera_exp, 3),
    diff     = round(zera_obs - zera_exp, 3)
  ))
}
knitr::kable(
  zi_test_pois,
  col.names = c("Typ śmieci", "Zera obserwowane", "Udział zer", "Zera oczekiwane", "Różnica"),
  caption = "Zera obserwowane vs oczekiwane wg Poissona"
)
Zera obserwowane vs oczekiwane wg Poissona
Typ śmieci Zera obserwowane Udział zer Zera oczekiwane Różnica
Beverage cont. 17 38.636 14.413 2.587

potwierdzenie wniosków z zero.test

Dla reszty - porównanie z NB:

  1. Brak odpowiednika NB zero.test

  2. Porównanie zer obserwowanych i oczekiwanych wg NB (https://r-statistics.co/Zero-Inflated-Distributions-in-R.html#:~:text=The%20data%20has%20308%20zeros%2C%20but%20a%20plain,hurdle%20model%20rather%20than%20a%20vanilla%20Poisson%20fit.)

zi_test = data.frame()
for (i in c(1, 2, 3, 5, 6, 7)) {
  dane_test = litter_data[[i]]
  nb = glm.nb(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14,
              data=dane_test)
  zera_obs = sum(dane_test$y == 0)                              # ile zer jest
  zera_exp = sum(dnbinom(0, mu = fitted(nb), size = nb$theta))  # zera oczekiwane wg NB
  zi_test = rbind(zi_test, data.frame(
    litter   = nazwy_litter[i],
    zera_obs = round(zera_obs, 3),
    proc_zer = round(100 * zera_obs / nrow(dane_test), 3),
    zera_exp = round(zera_exp, 3),
    diff     = round(zera_obs - zera_exp, 3)
  ))
}
knitr::kable(
  zi_test,
  col.names = c("Typ śmieci", "Zera obserwowane", "Udział zer", "Zera oczekiwane", "Różnica"),
  caption = "Zera obserwowane vs oczekiwane wg modelu NB"
)
Zera obserwowane vs oczekiwane wg modelu NB
Typ śmieci Zera obserwowane Udział zer Zera oczekiwane Różnica
Cigarette buds 0 0.000 0.027 -0.027
Plastic bags 3 6.818 1.930 1.070
Bottle caps 6 13.636 5.502 0.498
SUP 5 11.364 5.876 -0.876
Plastic ropes 0 0.000 1.054 -1.054
Styrofoam 1 2.273 1.962 -0.962

brak wskazań do modelu ZINB

zero-inflated Poisson (Beverage Containers)

zip4 = zeroinfl(y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14|
                  X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+lnX12+lnX13+X14, data=d4, dist="poisson")
stargazer(zip4,
          type="html", column.labels="Beverage cont.", model.numbers=FALSE,
          covariate.labels=etykiety)
Dependent variable:
y
Beverage cont.
BathingZoneManag 0.594
BlueFlag -0.623
GarbageBin 0.141
BeachCleanup -0.588
BeachCafe 1.613
CoastCafe 0.028
Accommod -0.084
Parking 0.916
Shops -0.612
PublTransport -0.390
BeachUseCateg -0.121
ln(LocalResidents) -0.176
ln(TouristNights2024) -0.615
LVSeaBasin 0.290
Constant 9.305
Observations 44
Log Likelihood -48.227
Note: p<0.1; p<0.05; p<0.01

Wyniki modelu ZIP nieistotne statystycznie. dodatkowo - mało obserwacji - osobliwa macierz wariancji kowariancji - standard errors niepoliczalne