Na podlagi podatkov 140 zavarovancev skušamo ugotoviti, kako različni dejavniki vplivajo na njihove letne zdravstvene stroške.
library(readxl)
podatki <- read_xlsx("./Zavarovanje.xlsx")
podatki <- as.data.frame(podatki)
head(podatki)
## ID Stroski Starost Spol BMI Otroci Kadilec
## 1 1 1749 19 Z 32.9 0 Ne
## 2 2 8017 42 Z 25.0 2 Ne
## 3 3 12593 52 Z 46.8 5 Ne
## 4 4 13982 63 M 36.8 0 Ne
## 5 5 11931 58 M 25.2 0 Ne
## 6 6 18972 34 M 25.3 2 Da
Opis spremenljivk
podatki$SpolF <- factor(podatki$Spol,
levels = c("Z", "M"),
labels = c("Z", "M"))
podatki$KadilecF <- factor(podatki$Kadilec,
levels = c("Ne", "Da"),
labels = c("Ne", "Da"))
summary(podatki[ , c(-1, -4, -7)])
## Stroski Starost BMI Otroci
## Min. : 1141 Min. :18.00 Min. :17.80 Min. :0.00
## 1st Qu.: 5700 1st Qu.:31.00 1st Qu.:26.10 1st Qu.:0.00
## Median :10652 Median :43.00 Median :31.30 Median :1.00
## Mean :13789 Mean :42.34 Mean :31.15 Mean :1.25
## 3rd Qu.:14476 3rd Qu.:55.00 3rd Qu.:35.88 3rd Qu.:2.00
## Max. :60021 Max. :64.00 Max. :49.10 Max. :5.00
## SpolF KadilecF
## Z:71 Ne:113
## M:69 Da: 27
##
##
##
##
library(car)
## Loading required package: carData
scatterplotMatrix(podatki[ , c(-1, -4, -7, -8, -9)],
smooth = FALSE)
### prva vrstica mi kaže vse povezave med stroški in 3 pojasnjevalnimi
spremenljivkami. med stroški in starostjo je linearna povezanost. po
diagonali so porazdelitve spremenljivk. stroški so asimetrično v
desno.
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.4.2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(as.matrix(podatki[ , c(-1, -4, -7, -8, -9)]))
## Stroski Starost BMI Otroci
## Stroski 1.00 0.26 0.17 0.10
## Starost 0.26 1.00 0.14 -0.02
## BMI 0.17 0.14 1.00 -0.06
## Otroci 0.10 -0.02 -0.06 1.00
##
## n= 140
##
##
## P
## Stroski Starost BMI Otroci
## Stroski 0.0020 0.0484 0.2607
## Starost 0.0020 0.1088 0.8463
## BMI 0.0484 0.1088 0.5179
## Otroci 0.2607 0.8463 0.5179
fit <- lm(Stroski ~ Starost + BMI + Otroci + SpolF+ KadilecF,
data = podatki)
vif(fit)
## Starost BMI Otroci SpolF KadilecF
## 1.028518 1.024215 1.005644 1.004913 1.011513
mean(vif(fit))
## [1] 1.014961
podatki$StdOst <- round(rstandard(fit), 3)
podatki$CooksD <- round(cooks.distance(fit), 3)
hist(podatki$StdOst,
xlab = "Standardizirani ostanki",
ylab = "Frekvenca",
main = "Histogram standardiziranih ostankov")
### Imamo osamelce. presojamo če so napake normalno porazdeljene.
shapiro.test(podatki$StdOst)
##
## Shapiro-Wilk normality test
##
## data: podatki$StdOst
## W = 0.85836, p-value = 2.888e-10
hist(podatki$CooksD,
xlab = "Cookove razdalje",
ylab = "Frekvenca",
main = "Histrogram Cookovih razdalj")
head(podatki[order(-podatki$CooksD), c("ID", "StdOst", "CooksD")], 6)
## ID StdOst CooksD
## 68 68 3.726 0.149
## 88 88 3.624 0.146
## 60 60 3.265 0.103
## 38 38 3.384 0.077
## 69 69 2.899 0.065
## 27 27 -1.979 0.046
podatkiC <- podatki[podatki$StdOst < 3, ]
nrow(podatkiC)
## [1] 135
fit <- lm(Stroski ~ Starost + BMI + Otroci + SpolF+ KadilecF,
data = podatkiC)
podatkiC$StdOst <- round(rstandard(fit), 3)
podatkiC$StdOcenVred <- scale(fit$fitted.values)
library(car)
scatterplot(y = podatkiC$StdOst, x = podatkiC$StdOcenVred,
ylab = "Standardizirani ostanki ",
xlab = "Standardizirane ocenjene vrednosti",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
### Ali je izpolnjena predpostavka o homoskedastičnosti? Ne. Ali je
izpolnjena linearnost? gledaš scatterplotmatrix. ni krivulj, torej je
linearnost izpolnjena. Gre samo za težavo heteroskedastičnosti.
head(podatkiC[order(-podatkiC$StdOst), c("ID", "StdOst", "CooksD")], 6)
## ID StdOst CooksD
## 69 69 3.914 0.065
## 39 39 2.292 0.014
## 16 16 2.199 0.037
## 72 72 2.136 0.038
## 78 78 1.986 0.029
## 87 87 1.900 0.020
podatkiC <- podatkiC[podatkiC$StdOst < 3, ]
nrow(podatkiC)
## [1] 134
fit <- lm(Stroski ~ Starost + BMI + Otroci + SpolF+ KadilecF,
data = podatkiC)
podatkiC$StdOcenVred <- scale(fit$fitted.values)
library(car)
scatterplot(y = podatkiC$StdOst, x = podatkiC$StdOcenVred,
ylab = "Standardizirani ostanki ",
xlab = "Standardizirane ocenjene vrednosti",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.2
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------
## Response : Stroski
## Variables: fitted values of Stroski
##
## Test Summary
## -------------------------------
## DF = 1
## Chi2 = 101.1432
## Prob > Chi2 = 8.556824e-24
summary(fit)
##
## Call:
## lm(formula = Stroski ~ Starost + BMI + Otroci + SpolF + KadilecF,
## data = podatkiC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10769.6 -2081.9 -99.2 2029.1 10829.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13187.52 2254.07 -5.851 3.86e-08 ***
## Starost 252.15 28.00 9.005 2.56e-15 ***
## BMI 333.33 60.91 5.473 2.24e-07 ***
## Otroci 375.29 279.12 1.345 0.181
## SpolFM -391.84 774.77 -0.506 0.614
## KadilecFDa 24467.72 983.29 24.884 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4469 on 128 degrees of freedom
## Multiple R-squared: 0.8433, Adjusted R-squared: 0.8372
## F-statistic: 137.8 on 5 and 128 DF, p-value: < 2.2e-16
library(estimatr)
## Warning: package 'estimatr' was built under R version 4.4.2
fit2 <- lm_robust(Stroski ~ Starost + BMI + Otroci + SpolF+ KadilecF,
se_type = "HC1",
data = podatkiC)
summary(fit2)
##
## Call:
## lm_robust(formula = Stroski ~ Starost + BMI + Otroci + SpolF +
## KadilecF, data = podatkiC, se_type = "HC1")
##
## Standard error type: HC1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) -13187.5 2597.86 -5.0763 1.324e-06 -18327.8 -8047.2
## Starost 252.1 26.59 9.4827 1.759e-16 199.5 304.8
## BMI 333.3 79.46 4.1950 5.060e-05 176.1 490.6
## Otroci 375.3 247.64 1.5155 1.321e-01 -114.7 865.3
## SpolFM -391.8 778.64 -0.5032 6.157e-01 -1932.5 1148.8
## KadilecFDa 24467.7 1728.39 14.1564 5.807e-28 21047.8 27887.6
## DF
## (Intercept) 128
## Starost 128
## BMI 128
## Otroci 128
## SpolFM 128
## KadilecFDa 128
##
## Multiple R-squared: 0.8433 , Adjusted R-squared: 0.8372
## F-statistic: 61.73 on 5 and 128 DF, p-value: < 2.2e-16
podatkiC$OcenVred <- round(fitted.values(fit2), 0)
head(podatkiC[c("ID", "Stroski", "OcenVred")])
## ID Stroski OcenVred
## 1 1 1749 2570
## 2 2 8017 6487
## 3 3 12593 17401
## 4 4 13982 14573
## 5 5 11931 9445
## 6 6 18972 28645