# Importing LI Data
LI <- c(8,8,10,10,12,12,12,14,14,14,16,16,16,18,20,20,20,22,22,24,26,28,32,34,38,38,38)
y <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,1,1,0,1,1,1,0)
df <- data.frame(LI, y)
# Modeling
fit <- glm(y ~ LI, family = binomial, data = df)
gam.fit <- gam(y ~ s(LI), family = binomial, data = df)
# Summarizing
summary(fit)
##
## Call:
## glm(formula = y ~ LI, family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9448 -0.6465 -0.4947 0.6571 1.6971
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.77714 1.37862 -2.740 0.00615 **
## LI 0.14486 0.05934 2.441 0.01464 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 34.372 on 26 degrees of freedom
## Residual deviance: 26.073 on 25 degrees of freedom
## AIC: 30.073
##
## Number of Fisher Scoring iterations: 4
confint(fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -6.9951909 -1.4098443
## LI 0.0425232 0.2846668
plot(jitter(y, 0.08) ~ LI, data = df)
curve(predict(gam.fit, data.frame(LI = x), type = "resp"), add = TRUE, col = "blue")
curve(predict(fit, data.frame(LI = x), type = "resp"), add = TRUE, col = "green")
legend(x = 7.5, y = .9, legend = c("Blue: GAM", "Green: logistic"))
# Attempting to overlay models on Jittered data with GGplot
#ggplot(df, aes(x = LI, y = y)) +
#geom_smooth(method = "glm", data = fit, formula = y ~ LI, n = 27, aes(color = "red")) +
#geom_smooth(method = "gam", data = gam.fit, formula = y ~ LI, n = 27, aes(color = "blue")) +
#geom_jitter(width = .01, height = 0.01, inherit.aes = TRUE)
findInt <- function(model, value){
function(x) {
predict(model, data.frame(LI = x), type = "resp") - value
}
}
# LD 75
LD75 <- uniroot(findInt(fit, .75), range(df$LI))$root; LD75
## [1] 33.65764
# Updating Plot
plot(jitter(y, 0.08) ~ LI, data = df)
curve(predict(fit, data.frame(LI = x), type = "resp"), add = TRUE, col = "green")
abline(v = LD75, col = "red")
legend(x = 7.5, y = .9, legend = c("Green: logistic", "Red: LD75"))
The predicted response reaches 75% at a labeling index value of 33.6576408. That is, at an LI of 33.6576408 the estimated probability of a cancer patient achieving remission is 75%.
# LD 10
LD10 <- uniroot(findInt(fit, .1), range(df$LI))$root; LD10
## [1] 10.90626
plot(jitter(y, 0.08) ~ LI, data = df)
curve(predict(fit, data.frame(LI = x), type = "resp"), add = TRUE, col = "red")
abline(v = LD75, col = "red")
abline(v = LD10, col = "purple")
legend(x = 7.5, y = .9, legend = c("Green: logistic", "Red: LD75", "Purple: LD10"))
The predicted response reaches 10% at a labeling index value of 10.9062595. That is, at an LI of 10.9062595 the estimated probability of a cancer patient achieving remission is 10%.
# Importing Crabs Data
df <- read.table(file = "http://users.stat.ufl.edu/~aa/intro-cda/data/Crabs.dat(Table%203.2)",
header = TRUE)
kable(head(df, n = 5))
| crab | sat | y | weight | width | color | spine |
|---|---|---|---|---|---|---|
| 1 | 8 | 1 | 3.05 | 28.3 | 2 | 3 |
| 2 | 0 | 0 | 1.55 | 22.5 | 3 | 3 |
| 3 | 9 | 1 | 2.30 | 26.0 | 1 | 1 |
| 4 | 0 | 0 | 2.10 | 24.8 | 3 | 3 |
| 5 | 4 | 1 | 2.60 | 26.0 | 3 | 3 |
fit <- glm(y ~ weight, family = binomial, data = df); fit
##
## Call: glm(formula = y ~ weight, family = binomial, data = df)
##
## Coefficients:
## (Intercept) weight
## -3.695 1.815
##
## Degrees of Freedom: 172 Total (i.e. Null); 171 Residual
## Null Deviance: 225.8
## Residual Deviance: 195.7 AIC: 199.7
s.fit <- summary(fit); s.fit
##
## Call:
## glm(formula = y ~ weight, family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1108 -1.0749 0.5426 0.9122 1.6285
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6947 0.8802 -4.198 2.70e-05 ***
## weight 1.8151 0.3767 4.819 1.45e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 195.74 on 171 degrees of freedom
## AIC: 199.74
##
## Number of Fisher Scoring iterations: 4
c <- fit$coefficients
ML Prediction Equation
\[logit[\hat{\pi(x)}] = -3.6947264 + 1.8151446x\]
\[\hat{\pi(x)} = \frac{e^{-3.6947264 + 1.8151446x}}{1 + e^{-3.6947264 + 1.8151446x}}\]
c[2]*1
## weight
## 1.815145
c[2]*0.1
## weight
## 0.1815145
c[2]*0.58
## weight
## 1.052784
m <- margins(fit); m
## Average marginal effects
## glm(formula = y ~ weight, family = binomial, data = df)
## weight
## 0.3495
# Alternative function for average marginal effect
# logitmfx(fit, atmean = FALSE, data = df)
The average marginal effect of weight per 1kg increase is represented by 0.3495341, thus the average marginal effect of weight per .1kg increase is one tenth of that effect, 0.0349534. The average marginal effect of a .1kg increase in weight can be interpreted as the average change in the probability of satellite as weight increases by .1kg is 0.0349534.
# Sample proportion of y = 1
prop <- sum(df$y)/nrow(df); prop
## [1] 0.6416185
# Classification table
predicted <- as.numeric(fitted(fit) > prop)
tabs <- xtabs(~ df$y + predicted); tabs
## predicted
## df$y 0 1
## 0 45 17
## 1 43 68
# Sensitivity P[y]
sens <- tabs[4]/(tabs[2]+tabs[4]); sens
## [1] 0.6126126
# Specificity
spef <- tabs[1]/(tabs[1]+tabs[3]); spef
## [1] 0.7258065
A sensitivity of 0.6126126 means that the probability of predicting a crab having a satellite using the model given that a crab truly has a satellite is 61.2612613%.
A specificity of 0.7258065 means that the probability of predicting a crab does not have a satellite using the model given that a crab truly does not have a satellite is 72.5806452%.
rocplot <- roc(y ~ fitted(fit), data = df)
plot.roc(rocplot, legacy.axes=FALSE)
AUC <- auc(rocplot); AUC
## Area under the curve: 0.7379
concordance <- AUC[1]
The area under the ROC curve is a measure of the predictive power of a model also known as the concordance index. This model has a concordance index of 0.7379396, indicating that the estimated probability that predictions and outcomes are concordant is 73.7939552%. A concordance index of 1 represents a model which perfectly predicts the presence of a satellite, whereas a concordance index of 0.5 is the equivalent of random guessing. The concordance index of this model using crab width as the only predictor indicates that it does a fair job of correctly predicting outcomes.
df <- read.table(file = "http://users.stat.ufl.edu/~aa/intro-cda/data/MBTI.dat",
header = TRUE)
df$no <- df$n - df$drink; df$yes <- df$drink
kable(df)
| EI | SN | TF | JP | smoke | drink | n | no | yes |
|---|---|---|---|---|---|---|---|---|
| e | s | t | j | 13 | 10 | 77 | 67 | 10 |
| e | s | t | p | 11 | 8 | 42 | 34 | 8 |
| e | s | f | j | 16 | 5 | 106 | 101 | 5 |
| e | s | f | p | 19 | 7 | 79 | 72 | 7 |
| e | n | t | j | 6 | 3 | 23 | 20 | 3 |
| e | n | t | p | 4 | 2 | 18 | 16 | 2 |
| e | n | f | j | 6 | 4 | 31 | 27 | 4 |
| e | n | f | p | 23 | 15 | 80 | 65 | 15 |
| i | s | t | j | 32 | 17 | 140 | 123 | 17 |
| i | s | t | p | 9 | 3 | 52 | 49 | 3 |
| i | s | f | j | 34 | 6 | 138 | 132 | 6 |
| i | s | f | p | 29 | 4 | 106 | 102 | 4 |
| i | n | t | j | 4 | 1 | 13 | 12 | 1 |
| i | n | t | p | 9 | 5 | 35 | 30 | 5 |
| i | n | f | j | 4 | 1 | 31 | 30 | 1 |
| i | n | f | p | 22 | 6 | 79 | 73 | 6 |
fit <- glm(yes/(yes+no) ~ EI + SN + TF + JP,
weights = yes + no,
family = binomial(link = "logit"),
data = df); fit
##
## Call: glm(formula = yes/(yes + no) ~ EI + SN + TF + JP, family = binomial(link = "logit"),
## data = df, weights = yes + no)
##
## Coefficients:
## (Intercept) EIi SNs TFt JPp
## -2.1140 -0.5550 -0.4292 0.6873 0.2022
##
## Degrees of Freedom: 15 Total (i.e. Null); 11 Residual
## Null Deviance: 30.49
## Residual Deviance: 11.15 AIC: 73.99
c <- fit$coefficients
summary(fit)
##
## Call:
## glm(formula = yes/(yes + no) ~ EI + SN + TF + JP, family = binomial(link = "logit"),
## data = df, weights = yes + no)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2712 -0.8062 -0.1063 0.1124 1.5807
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1140 0.2715 -7.788 6.82e-15 ***
## EIi -0.5550 0.2170 -2.558 0.01053 *
## SNs -0.4292 0.2340 -1.834 0.06664 .
## TFt 0.6873 0.2206 3.116 0.00184 **
## JPp 0.2022 0.2266 0.893 0.37209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 30.488 on 15 degrees of freedom
## Residual deviance: 11.149 on 11 degrees of freedom
## AIC: 73.99
##
## Number of Fisher Scoring iterations: 4
Prediction Equation
\[logit[\hat{\pi(x)}] = -2.114047 + -0.5550115I + -0.4291508S + 0.6873349T + 0.2022295P\]
\[\hat{\pi(x)} = \frac{e^{-2.114047 + -0.5550115I + -0.4291508S + 0.6873349T + 0.2022295P}}{1 + e^{-2.114047 + -0.5550115I + -0.4291508S + 0.6873349T + 0.2022295P}}\]
By default, R set E, N, F & J to be built into the intercept term leaving estimated regression coefficients for I = 1, S = 1, T = 1 & P = 1.
It follows logically why the predicted probability of drinking alcohol frequently is associated with the ENTP run; the coefficients for I = S = 1 are negative while the coefficients for T = P = 1 are positive. When I = S = 0 and T = P = 1, the negative coefficients have no impact while the positive coefficients are impacting the predicted probability.
\(\hat{\pi(ENTP)} = \frac{e^{-2.114047 + -0.5550115I + -0.4291508S + 0.6873349T + 0.2022295P}}{1 + e^{-2.114047 + -0.5550115I + -0.4291508S + 0.6873349T + 0.2022295P}}\)
\(\frac{e^{-2.114047 + -0.5550115(0) + -0.4291508(0) + 0.6873349(1) + 0.2022295(1)}}{1 + e^{-2.114047 + -0.5550115(0) + -0.4291508(0) + 0.6873349(1) + 0.2022295(1)}} = \frac{0.2939097}{1 + 0.2939097}\)
\(0.2271486\)
pred.prob <- fitted(fit)
lp <- predict(fit, se.fit = TRUE)
LB <- lp$fit - 1.96*lp$se.fit
UB <- lp$fit + 1.96*lp$se.fit
LB.p <- exp(LB)/(1+exp(LB))
UB.p <- exp(UB)/(1+exp(UB))
out <- data.frame(df$EI, df$SN, df$TF, df$JP,
pred.prob,
LB.p,
UB.p)
kable(out)
| df.EI | df.SN | df.TF | df.JP | pred.prob | LB.p | UB.p |
|---|---|---|---|---|---|---|
| e | s | t | j | 0.1351860 | 0.0931166 | 0.1922332 |
| e | s | t | p | 0.1606185 | 0.1046828 | 0.2384813 |
| e | s | f | j | 0.0728848 | 0.0475777 | 0.1100969 |
| e | s | f | p | 0.0877863 | 0.0569453 | 0.1329754 |
| e | n | t | j | 0.1936115 | 0.1222293 | 0.2927765 |
| e | n | t | p | 0.2271486 | 0.1504295 | 0.3278934 |
| e | n | f | j | 0.1077390 | 0.0662294 | 0.1705142 |
| e | n | f | p | 0.1287768 | 0.0873674 | 0.1858168 |
| i | s | t | j | 0.0823472 | 0.0557648 | 0.1199910 |
| i | s | t | p | 0.0989769 | 0.0633996 | 0.1512937 |
| i | s | f | j | 0.0431812 | 0.0272289 | 0.0678276 |
| i | s | f | p | 0.0523527 | 0.0329641 | 0.0821759 |
| i | n | t | j | 0.1211352 | 0.0727629 | 0.1949056 |
| i | n | t | p | 0.1443656 | 0.0911432 | 0.2211060 |
| i | n | f | j | 0.0648240 | 0.0376312 | 0.1094325 |
| i | n | f | p | 0.0782166 | 0.0500682 | 0.1201877 |
df <- read.table(file = "http://users.stat.ufl.edu/~aa/intro-cda/data/Infection.dat(Table%204.16)",
header = TRUE)
kable(head(df, n = 5))
| center | treat | y | n |
|---|---|---|---|
| 1 | 1 | 11 | 36 |
| 1 | 0 | 10 | 37 |
| 2 | 1 | 16 | 20 |
| 2 | 0 | 22 | 32 |
| 3 | 1 | 14 | 19 |
df <- read.table(file = "http://users.stat.ufl.edu/~aa/intro-cda/data/SoreThroat.dat(Table%204.19)",
header = TRUE,
skip = 5)
kable(head(df, n = 5), caption = "D = surgery duration (in min), T = type of device (0 = laryngeal mask airway, 1 = tracheal tube), Y = sore throat on waking (1=yes, 0=no)")
D = surgery duration (in min), T = type of device (0 = laryngeal mask airway, 1 = tracheal tube), Y = sore throat on waking (1=yes, 0=no)
| D | T | Y |
|---|---|---|
| 45 | 0 | 0 |
| 15 | 0 | 0 |
| 40 | 0 | 1 |
| 83 | 1 | 1 |
| 90 | 1 | 1 |