7. Fit a logistic regression model on the South African Heart Disease Dataset.
The dataset is available from the Elements of Statistical Learning (EOSL) webpage: https://web.stanford.edu/~hastie/ElemStatLearn/ You can also see the data description on EOSL webpage. You can get the data into R by the following code in R.
eosl <- read.table("http://statweb.lsu.edu/faculty/li/data/SAheart.txt",
sep=",", header=T, row.names=1)
Set the ‘Present’ as 1 and ‘Absent’ as 0 for variable ‘famhist’.
levels(eosl$famhist) <- c(0, 1)
eosl$famhist <- as.numeric(as.character(eosl$famhist))
7.b There are 462 observations in the dataset. Randomly split the dataset into 400 observations as the training set. The rest 62 observations as the test set.
set.seed(4)
train <- sample(1:nrow(eosl), 400, replace = F)
trainDt <- eosl[train,]
testDt <- eosl[-train,]
7.c Then fit a logistic regression using ‘famhist’ (now become 0 and 1 binary variable) as the response and all the other variables as the explanatory variables.
fit <- glm(famhist ~ . , family = binomial(link='logit'), data = trainDt)
summary(fit)
##
## Call:
## glm(formula = famhist ~ ., family = binomial(link = "logit"),
## data = trainDt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6942 -0.9711 -0.6613 1.1294 1.9676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.249495 1.167428 -2.783 0.00538 **
## sbp -0.003567 0.005800 -0.615 0.53861
## tobacco -0.032501 0.025340 -1.283 0.19964
## ldl 0.061218 0.059692 1.026 0.30510
## adiposity -0.002335 0.026535 -0.088 0.92988
## typea 0.010253 0.011543 0.888 0.37443
## obesity 0.040684 0.038184 1.065 0.28667
## alcohol 0.006697 0.004611 1.452 0.14638
## age 0.028780 0.011028 2.610 0.00906 **
## chd 0.763461 0.249899 3.055 0.00225 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 539.99 on 399 degrees of freedom
## Residual deviance: 496.52 on 390 degrees of freedom
## AIC: 516.52
##
## Number of Fisher Scoring iterations: 4
7.d) Make the prediction on the training and test sets. Using the 0.5 as the cutoff point to get the misclassification rate on the training and test sets, respectively.
predictTest <- as.numeric(predict(fit, testDt[, -which(names(testDt) == 'famhist')], type = 'response') > .5)
predictTraining <- as.numeric(predict(fit, trainDt[, -which(names(testDt) == 'famhist')], type = 'response') > .5)
tibble(value = list(test = testDt$famhist, predictTest = predictTest, training = trainDt$famhist, predictTraining = predictTraining)) %>%
mutate(set = names(value)) %>%
unnest() %>%
group_by(set) %>% summarise(positive = sum(value), negative = sum(value == 0)) -> predictionTable
#gather(outcome, number, positive, negative) ->
kable(predictionTable, caption = 'Prediction table with the fitted model')
Prediction table with the fitted model
| predictTest |
26 |
36 |
| predictTraining |
117 |
283 |
| test |
30 |
32 |
| training |
162 |
238 |
predictionTable %>% filter(set %in% c('predictTest', 'test')) %>%
gather(outcome, number, positive, negative) %>%
group_by(outcome) %>%
summarise(ratio = number[set == 'predictTest']/number[set =='test']) %>%
mutate(ratio = scales::percent(ratio)) %>%
kable(caption = 'Percentage of true predictions for the test dataset')
Percentage of true predictions for the test dataset
| negative |
112.5% |
| positive |
86.7% |
predictionTable %>% filter(set %in% c('predictTraining', 'training')) %>%
gather(outcome, number, positive, negative) %>%
group_by(outcome) %>%
summarise(ratio = number[set == 'predictTraining']/number[set =='training']) %>%
mutate(ratio = scales::percent(ratio)) %>%
kable(caption = 'Percentage of true predictions for the training dataset')
Percentage of true predictions for the training dataset
| negative |
118.9% |
| positive |
72.2% |
7.e) Find the AUC score and plot the ROC curve based on the test set performance.
library(AUC)
# sensibility <- function(x) {
# tibble(outcome = testDt$famhist,
# prob = predict(fit, select(trainDt, -famhist), type='response')) %>%
# mutate(predicted = ifelse(prob >= x, 1, 0)) %>%
# count(outcome, predicted) %>%
# complete(outcome = c(0,1), predicted = c(0,1), fill = list(n = 0)) %>%
# group_by(outcome) %>%
# summarise(ratio = n[predicted == outcome]/ sum(n))
# }
# seq(0, 1, .01) %>% map(sensibility) -> tt
# tibble(truePositive = unlist(tt %>% map('ratio') %>% map(`[`,1)),
# falsePositive = 1 - unlist(tt %>% map('ratio') %>% map(`[`,2))) -> dd
dtROC <- roc(predict(fit, select(testDt, -famhist), type='response'), factor(testDt$famhist))
ggplot() +
geom_line(aes(x = dtROC$fpr, y = dtROC$tpr)) +
geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1, colour = "segment")) +
ggtitle('ROC curve') +
xlab('False positive - 1-sensitivity') +
ylab('True Positive - specificity') +
theme(legend.position = 'none')

#AUC score
auc(dtROC)
## [1] 0.703125