# Load the pima data set that ships with the faraway package
data(pima, package = "faraway")
# Response used by the models below: the `test` column converted to a factor
b <- factor(pima$test)
# Show the first rows of the data as a formatted table
kable(head(pima))
pregnant | glucose | diastolic | triceps | insulin | bmi | diabetes | age | test |
---|---|---|---|---|---|---|---|---|
6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 |
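N.B. This is for demonstration purposes only. Analysis of the output shows that this is an extremely poor fit: the fitting algorithm did not converge, and every coefficient has an enormous standard error and a p-value of essentially 1. Note that `test`, the column that `b` was created from, appears among the fitted coefficients.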
# Fit a logistic regression of b on every predictor column of pima.
# In a formula, `.` expands to all columns of `data`, and that includes `test`,
# the very column b was built from. Leaving it in lets the model reproduce the
# response exactly, which is what triggered the
# "glm.fit: algorithm did not converge" warning and the degenerate coefficient
# table. Remove it from the right-hand side with `- test`.
m <- glm(b ~ . - test, family = binomial, data = pima)
# NOTE: any summary output recorded for the earlier `b ~ .` fit (it lists a
# `test` coefficient) no longer matches this model and should be regenerated.
summary(m)
##
## Call:
## glm(formula = b ~ ., family = binomial, data = pima)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.409e-06 -2.409e-06 -2.409e-06 2.409e-06 2.409e-06
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.657e+01 8.091e+04 0.000 1.000
## pregnant -2.368e-12 4.613e+03 0.000 1.000
## glucose -3.396e-13 4.967e+02 0.000 1.000
## diastolic -2.501e-13 7.261e+02 0.000 1.000
## triceps -9.012e-13 9.897e+02 0.000 1.000
## insulin 1.013e-13 1.334e+02 0.000 1.000
## bmi 8.525e-13 1.906e+03 0.000 1.000
## diabetes -4.014e-11 4.037e+04 0.000 1.000
## age 5.410e-13 1.381e+03 0.000 1.000
## test 5.313e+01 3.230e+04 0.002 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9.9348e+02 on 767 degrees of freedom
## Residual deviance: 4.4556e-09 on 758 degrees of freedom
## AIC: 20
##
## Number of Fisher Scoring iterations: 25
# Logistic regression of b on two predictors only: diastolic and bmi
m <- glm(
  b ~ diastolic + bmi,
  family = binomial,
  data = pima
)
summary(m)
##
## Call:
## glm(formula = b ~ diastolic + bmi, family = binomial, data = pima)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9128 -0.9180 -0.6848 1.2336 2.7417
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.629553 0.468176 -7.753 9.01e-15 ***
## diastolic -0.001096 0.004432 -0.247 0.805
## bmi 0.094130 0.012298 7.654 1.95e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 920.65 on 765 degrees of freedom
## AIC: 926.65
##
## Number of Fisher Scoring iterations: 4
The previous result shows that, of the two predictors, only bmi is significant (diastolic has p = 0.805), so we create a new reduced model that uses bmi alone.
# Reduced model: b is explained by bmi alone
m <- glm(b ~ bmi, family = binomial, data = pima)
# With the model fitted, build a one-row data frame to predict for (bmi = 32)
newdata <- data.frame(bmi = 32.0)
# No `type` given, so predict() reports on the link (log-odds) scale
predict(m, newdata = newdata)
## 1
## -0.6934372
# Same query, this time on the response scale
predict(m, newdata = newdata, type = "response")
## 1
## 0.3332689
#use type="response" to output probability
The result shows that, for bmi = 32, the predicted probability of b = 1 (positive for diabetes) is about 33.3%.
# Second query: predict for a much larger bmi value
newdata <- data.frame(bmi = 67.0)
predict(m, newdata = newdata, type = "response")
## 1
## 0.9295718
# the result shows that the predicted probability of b = 1 (positive for diabetes) is about 93.0% (very likely)