## Homework 1: Classification
# load data
heart_data <- read_csv("heart_disease.csv")
## Rows: 303 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# check for missing data
colSums(is.na(heart_data))
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal target
## 0 0 0 0 0 0
# check variable types
glimpse(heart_data)
## Rows: 303
## Columns: 14
## $ age <dbl> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58, 5…
## $ sex <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1…
## $ cp <dbl> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3, 0…
## $ trestbps <dbl> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130, 1…
## $ chol <dbl> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275, 2…
## $ fbs <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ restecg <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1…
## $ thalach <dbl> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139, 1…
## $ exang <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ oldpeak <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2, 0…
## $ slope <dbl> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2, 1…
## $ ca <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0…
## $ thal <dbl> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3…
## $ target <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# convert necessary variables to factors
heart_data <- heart_data %>%
mutate(sex = as.factor(sex),
cp = as.factor(cp),
fbs = as.factor(fbs),
restecg = as.factor(restecg),
exang = as.factor(exang),
slope = as.factor(slope),
ca = as.factor(ca),
thal = as.factor(thal))
# build a linear model
model_lm <- lm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal, data = heart_data)
summary(model_lm)
##
## Call:
## lm(formula = target ~ age + sex + cp + trestbps + chol + fbs +
## restecg + thalach + exang + oldpeak + slope + ca + thal,
## data = heart_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01468 -0.18519 0.03069 0.22575 1.02624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4632832 0.3688720 1.256 0.210183
## age 0.0027129 0.0026488 1.024 0.306638
## sex1 -0.1640717 0.0482271 -3.402 0.000766 ***
## cp1 0.1688621 0.0639741 2.640 0.008767 **
## cp2 0.2252316 0.0537885 4.187 3.78e-05 ***
## cp3 0.2657100 0.0811006 3.276 0.001184 **
## trestbps -0.0023401 0.0012187 -1.920 0.055844 .
## chol -0.0003189 0.0004021 -0.793 0.428416
## fbs1 0.0333463 0.0574909 0.580 0.562362
## restecg1 0.0469891 0.0408340 1.151 0.250823
## restecg2 -0.0870584 0.1757245 -0.495 0.620689
## thalach 0.0018708 0.0011189 1.672 0.095634 .
## exang1 -0.0935220 0.0499945 -1.871 0.062437 .
## oldpeak -0.0415842 0.0230281 -1.806 0.072023 .
## slope1 -0.0653951 0.0859299 -0.761 0.447281
## slope2 0.0736023 0.0950718 0.774 0.439480
## ca1 -0.2703534 0.0530757 -5.094 6.46e-07 ***
## ca2 -0.3426057 0.0682472 -5.020 9.20e-07 ***
## ca3 -0.2960027 0.0861537 -3.436 0.000680 ***
## ca4 0.0443264 0.1555910 0.285 0.775939
## thal1 0.2283726 0.2543715 0.898 0.370068
## thal2 0.2879971 0.2418275 1.191 0.234694
## thal3 0.0784919 0.2436939 0.322 0.747623
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.335 on 280 degrees of freedom
## Multiple R-squared: 0.5818, Adjusted R-squared: 0.549
## F-statistic: 17.71 on 22 and 280 DF, p-value: < 2.2e-16
# visualize predictions
heart_data$prediction_lm <- predict(model_lm)
boxplot(heart_data$prediction_lm ~ heart_data$target, main = "Predicted vs. Actual Heart Disease",
xlab = "Actual Heart Disease", ylab = "Predicted Heart Disease")
An adjusted R-squared value of 0.549 indicates a moderately strong
model
# establish threshold
threshold <- 0.5
heart_data$disease_prediction <- ifelse(heart_data$prediction_lm >= threshold, 1, 0)
accuracy <- sum(heart_data$target == heart_data$disease_prediction) / nrow(heart_data)
print(paste('Accuracy:', accuracy))
## [1] "Accuracy: 0.877887788778878"
# run a logistic model
model_glm <- glm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal,
family = "binomial", data = heart_data)
summary(model_glm)
##
## Call:
## glm(formula = target ~ age + sex + cp + trestbps + chol + fbs +
## restecg + thalach + exang + oldpeak + slope + ca + thal,
## family = "binomial", data = heart_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.179045 3.705420 0.048 0.961461
## age 0.027819 0.025428 1.094 0.273938
## sex1 -1.862297 0.570844 -3.262 0.001105 **
## cp1 0.864708 0.578000 1.496 0.134645
## cp2 2.003186 0.529356 3.784 0.000154 ***
## cp3 2.417107 0.719242 3.361 0.000778 ***
## trestbps -0.026162 0.011943 -2.191 0.028481 *
## chol -0.004291 0.004245 -1.011 0.312053
## fbs1 0.445666 0.587977 0.758 0.448472
## restecg1 0.460582 0.399615 1.153 0.249089
## restecg2 -0.714204 2.768873 -0.258 0.796453
## thalach 0.020055 0.011859 1.691 0.090820 .
## exang1 -0.779111 0.451839 -1.724 0.084652 .
## oldpeak -0.397174 0.242346 -1.639 0.101239
## slope1 -0.775084 0.880495 -0.880 0.378707
## slope2 0.689965 0.947657 0.728 0.466568
## ca1 -2.342301 0.527416 -4.441 8.95e-06 ***
## ca2 -3.483178 0.811640 -4.292 1.77e-05 ***
## ca3 -2.247144 0.937629 -2.397 0.016547 *
## ca4 1.267961 1.720014 0.737 0.461013
## thal1 2.637558 2.684285 0.983 0.325808
## thal2 2.367747 2.596159 0.912 0.361759
## thal3 0.915115 2.600380 0.352 0.724901
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 179.63 on 280 degrees of freedom
## AIC: 225.63
##
## Number of Fisher Scoring iterations: 6
# find optimal threshold to maximize accuracy
heart_data$prediction_glm <- predict(model_glm, type = "response")
best_threshold <- 0
best_accuracy <- 0
for (threshold in seq(0.4, 0.6, 0.01)) {
heart_data$disease_prediction <- ifelse(heart_data$prediction_glm >= threshold, 1, 0)
accuracy <- sum(heart_data$target == heart_data$disease_prediction) / nrow(heart_data)
if (accuracy > best_accuracy) {
best_threshold <- threshold
best_accuracy <- accuracy
}
}
print(paste('Best Threshold:', best_threshold, 'Best Accuracy:', best_accuracy))
## [1] "Best Threshold: 0.54 Best Accuracy: 0.887788778877888"
# evaluate model on unseen data
set.seed(123)
train_index <- sample(1:nrow(heart_data), 0.8 * nrow(heart_data))
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]
# rerun logistic model on trained data
train_glm <- glm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal,
data = train_data,
family = "binomial")
test_data$prediction_glm <- predict(train_glm, test_data, type = "response")
# set thresholds
best_threshold <- 0
best_accuracy <- 0
for (threshold in seq(0.4, 0.6, 0.01)) {
test_data$disease_prediction <- ifelse(test_data$prediction_glm >= threshold, 1, 0)
accuracy <- sum(test_data$target == test_data$disease_prediction) / nrow(test_data)
if (accuracy > best_accuracy) {
best_threshold <- threshold
best_accuracy <- accuracy
}
}
print(paste('Best Threshold:', best_threshold, 'Best Accuracy:', best_accuracy))
## [1] "Best Threshold: 0.4 Best Accuracy: 0.852459016393443"