Raziskovalec iz Medicinske fakultete želi razviti model, ki bi predvidel, kateri dejavniki vplivajo na razvoj sladkorne bolezni pri ženskah. Pojasnjevalne spremenljivke so nosečnost, raven glukoze, pritisk, indeks telesne mase in starost. #### odvisna spr je sladkorna bolezen. delamo logistično regresijo, ker je odvisna spremenljivka kategorialna z dvema kategorijama in ne moremo uporabiti klasične regresije.
library(readxl)
podatki <- read_xlsx("./Diabetes.xlsx")
podatki <- as.data.frame(podatki)
head(podatki)
## ID Nosecnost Glukoza Pritisk BMI Starost Diabetes
## 1 1 1 97 64 18.2 21 neg
## 2 2 1 83 68 18.2 27 neg
## 3 3 1 97 70 18.2 21 neg
## 4 4 0 104 76 18.4 27 neg
## 5 5 1 80 55 19.1 21 neg
## 6 6 3 99 80 19.3 30 neg
Opis spremenljivk:
podatki$DiabetesF <- factor(podatki$Diabetes,
levels = c("neg", "poz"),
labels = c("neg", "poz"))
summary(podatki[ , c(-1, -7)])
## Nosecnost Glukoza Pritisk BMI
## Min. :0.000 Min. : 44.0 Min. : 24.0 Min. :18.20
## 1st Qu.:1.000 1st Qu.: 99.0 1st Qu.: 64.0 1st Qu.:27.40
## Median :2.000 Median :115.0 Median : 70.0 Median :32.00
## Mean :2.713 Mean :120.7 Mean : 71.4 Mean :32.39
## 3rd Qu.:4.000 3rd Qu.:139.0 3rd Qu.: 80.0 3rd Qu.:36.58
## Max. :7.000 Max. :199.0 Max. :122.0 Max. :67.10
## Starost DiabetesF
## Min. :21.00 neg:422
## 1st Qu.:23.00 poz:184
## Median :27.00
## Mean :31.12
## 3rd Qu.:36.00
## Max. :70.00
fit0 <- glm(DiabetesF ~ 1,
family = binomial,
data = podatki)
summary(fit0)
##
## Call:
## glm(formula = DiabetesF ~ 1, family = binomial, data = podatki)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.83007 0.08834 -9.396 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 744.06 on 605 degrees of freedom
## Residual deviance: 744.06 on 605 degrees of freedom
## AIC: 746.06
##
## Number of Fisher Scoring iterations: 4
exp(cbind(odds = fit0$coefficients, confint.default(fit0)))
## odds 2.5 % 97.5 %
## (Intercept) 0.436019 0.3666974 0.5184452
head(fitted(fit0))
## 1 2 3 4 5 6
## 0.3036304 0.3036304 0.3036304 0.3036304 0.3036304 0.3036304
Pseudo_R2_fit1 <- 422/606
Pseudo_R2_fit1
## [1] 0.6963696
fit1 <- glm(DiabetesF ~ Nosecnost + Glukoza + Pritisk + BMI + Starost,
family = binomial,
data = podatki)
library(car)
## Loading required package: carData
vif(fit1)
## Nosecnost Glukoza Pritisk BMI Starost
## 1.251598 1.044367 1.202577 1.136284 1.370504
podatki$StdResid <- rstandard(fit1)
podatki$Cook <- cooks.distance(fit1)
hist(podatki$StdResid,
main = "Histogram standardiziranih ostankov",
ylab = "Frekvenca",
xlab = "Standardizirani ostanki")
head(podatki[order(-podatki$Cook), c("ID", "Cook")])
## ID Cook
## 590 590 0.03076887
## 173 173 0.02342203
## 30 30 0.02108623
## 600 600 0.01626678
## 382 382 0.01580275
## 603 603 0.01579207
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
podatki <- podatki %>%
filter(!ID %in% c(590,173,30))
fit0 <- glm(DiabetesF ~ 1,
family = binomial,
data = podatki)
fit1 <- glm(DiabetesF ~ Nosecnost + Glukoza + Pritisk + BMI + Starost,
family = binomial,
data = podatki)
anova(fit0, fit1, test = "Chi")
## Analysis of Deviance Table
##
## Model 1: DiabetesF ~ 1
## Model 2: DiabetesF ~ Nosecnost + Glukoza + Pritisk + BMI + Starost
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 602 740.23
## 2 597 540.34 5 199.88 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit1)
##
## Call:
## glm(formula = DiabetesF ~ Nosecnost + Glukoza + Pritisk + BMI +
## Starost, family = binomial, data = podatki)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.737724 0.883714 -9.888 < 2e-16 ***
## Nosecnost 0.056328 0.055118 1.022 0.30680
## Glukoza 0.036204 0.004063 8.910 < 2e-16 ***
## Pritisk -0.016456 0.009432 -1.745 0.08103 .
## BMI 0.098312 0.017317 5.677 1.37e-08 ***
## Starost 0.034460 0.011363 3.033 0.00242 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 740.23 on 602 degrees of freedom
## Residual deviance: 540.34 on 597 degrees of freedom
## AIC: 552.34
##
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = fit1$coefficients, confint.default(fit1)))
## OR 2.5 % 97.5 %
## (Intercept) 0.0001604186 2.838156e-05 0.0009067201
## Nosecnost 1.0579446319 9.496133e-01 1.1786343241
## Glukoza 1.0368672215 1.028643e+00 1.0451574876
## Pritisk 0.9836786327 9.656616e-01 1.0020317766
## BMI 1.1033071072 1.066489e+00 1.1413962888
## Starost 1.0350609958 1.012263e+00 1.0583723665
podatki$OcenaVer <- fitted(fit1)
head(podatki)
## ID Nosecnost Glukoza Pritisk BMI Starost Diabetes DiabetesF
## 1 1 1 97 64 18.2 21 neg neg
## 2 2 1 83 68 18.2 27 neg neg
## 3 3 1 97 70 18.2 21 neg neg
## 4 4 0 104 76 18.4 27 neg neg
## 5 5 1 80 55 19.1 21 neg neg
## 6 6 3 99 80 19.3 30 neg neg
## StdResid Cook OcenaVer
## 1 -0.2300779 1.288801e-05 0.02389603
## 2 -0.1928436 7.060377e-06 0.01669561
## 3 -0.2210807 1.187044e-05 0.02169811
## 4 -0.2566752 2.310497e-05 0.02977654
## 5 -0.1899148 6.431856e-06 0.01648410
## 6 -0.2763028 2.502278e-05 0.03325264
podatki$Uvrstitev <- ifelse(test = podatki$OcenaVer < 0.50,
yes = "Ne",
no = "Da")
podatki$UvrstitevF <- factor(podatki$Uvrstitev,
levels = c("Ne", "Da"),
labels = c("Ne", "Da"))
rezultat <- table(podatki$DiabetesF, podatki$UvrstitevF)
rezultat
##
## Ne Da
## neg 378 42
## poz 87 96
Pseudo_R2_fit2 <- (rezultat[1, 1] + rezultat[2, 2] )/ nrow(podatki)
Pseudo_R2_fit2
## [1] 0.7860697