## Homework 1: Classification

Loading the Data (1 point)

# load data
heart_data <- read_csv("heart_disease.csv")
## Rows: 303 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Preprocessing (1 point)

# check for missing data
colSums(is.na(heart_data))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0
# check variable types
glimpse(heart_data)
## Rows: 303
## Columns: 14
## $ age      <dbl> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58, 5…
## $ sex      <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1…
## $ cp       <dbl> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3, 0…
## $ trestbps <dbl> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130, 1…
## $ chol     <dbl> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275, 2…
## $ fbs      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ restecg  <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1…
## $ thalach  <dbl> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139, 1…
## $ exang    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ oldpeak  <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2, 0…
## $ slope    <dbl> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2, 1…
## $ ca       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0…
## $ thal     <dbl> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3…
## $ target   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# convert necessary variables to factors
heart_data <- heart_data %>%
  mutate(sex = as.factor(sex),
         cp = as.factor(cp),
         fbs = as.factor(fbs),
         restecg = as.factor(restecg),
         exang = as.factor(exang),
         slope = as.factor(slope),
         ca = as.factor(ca),
         thal = as.factor(thal))

Building a Linear Model (1 point)

# build a linear model
model_lm <- lm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal, data = heart_data)
summary(model_lm)
## 
## Call:
## lm(formula = target ~ age + sex + cp + trestbps + chol + fbs + 
##     restecg + thalach + exang + oldpeak + slope + ca + thal, 
##     data = heart_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.01468 -0.18519  0.03069  0.22575  1.02624 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.4632832  0.3688720   1.256 0.210183    
## age          0.0027129  0.0026488   1.024 0.306638    
## sex1        -0.1640717  0.0482271  -3.402 0.000766 ***
## cp1          0.1688621  0.0639741   2.640 0.008767 ** 
## cp2          0.2252316  0.0537885   4.187 3.78e-05 ***
## cp3          0.2657100  0.0811006   3.276 0.001184 ** 
## trestbps    -0.0023401  0.0012187  -1.920 0.055844 .  
## chol        -0.0003189  0.0004021  -0.793 0.428416    
## fbs1         0.0333463  0.0574909   0.580 0.562362    
## restecg1     0.0469891  0.0408340   1.151 0.250823    
## restecg2    -0.0870584  0.1757245  -0.495 0.620689    
## thalach      0.0018708  0.0011189   1.672 0.095634 .  
## exang1      -0.0935220  0.0499945  -1.871 0.062437 .  
## oldpeak     -0.0415842  0.0230281  -1.806 0.072023 .  
## slope1      -0.0653951  0.0859299  -0.761 0.447281    
## slope2       0.0736023  0.0950718   0.774 0.439480    
## ca1         -0.2703534  0.0530757  -5.094 6.46e-07 ***
## ca2         -0.3426057  0.0682472  -5.020 9.20e-07 ***
## ca3         -0.2960027  0.0861537  -3.436 0.000680 ***
## ca4          0.0443264  0.1555910   0.285 0.775939    
## thal1        0.2283726  0.2543715   0.898 0.370068    
## thal2        0.2879971  0.2418275   1.191 0.234694    
## thal3        0.0784919  0.2436939   0.322 0.747623    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.335 on 280 degrees of freedom
## Multiple R-squared:  0.5818, Adjusted R-squared:  0.549 
## F-statistic: 17.71 on 22 and 280 DF,  p-value: < 2.2e-16

Evaluating the Linear Model (1 point)

# visualize predictions
heart_data$prediction_lm <- predict(model_lm)
boxplot(heart_data$prediction_lm ~ heart_data$target, main = "Predicted vs. Actual Heart Disease",
        xlab = "Actual Heart Disease", ylab = "Predicted Heart Disease")

An adjusted R-squared value of 0.549 indicates a moderately strong model

Threshold Selection (2 point)

# establish threshold
threshold <- 0.5
heart_data$disease_prediction <- ifelse(heart_data$prediction_lm >= threshold, 1, 0)
accuracy <- sum(heart_data$target == heart_data$disease_prediction) / nrow(heart_data)
print(paste('Accuracy:', accuracy))
## [1] "Accuracy: 0.877887788778878"

Improving the Model with Logistic Regression (1 point)

# run a logistic model
model_glm <- glm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal,
                 family = "binomial", data = heart_data)

summary(model_glm)
## 
## Call:
## glm(formula = target ~ age + sex + cp + trestbps + chol + fbs + 
##     restecg + thalach + exang + oldpeak + slope + ca + thal, 
##     family = "binomial", data = heart_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.179045   3.705420   0.048 0.961461    
## age          0.027819   0.025428   1.094 0.273938    
## sex1        -1.862297   0.570844  -3.262 0.001105 ** 
## cp1          0.864708   0.578000   1.496 0.134645    
## cp2          2.003186   0.529356   3.784 0.000154 ***
## cp3          2.417107   0.719242   3.361 0.000778 ***
## trestbps    -0.026162   0.011943  -2.191 0.028481 *  
## chol        -0.004291   0.004245  -1.011 0.312053    
## fbs1         0.445666   0.587977   0.758 0.448472    
## restecg1     0.460582   0.399615   1.153 0.249089    
## restecg2    -0.714204   2.768873  -0.258 0.796453    
## thalach      0.020055   0.011859   1.691 0.090820 .  
## exang1      -0.779111   0.451839  -1.724 0.084652 .  
## oldpeak     -0.397174   0.242346  -1.639 0.101239    
## slope1      -0.775084   0.880495  -0.880 0.378707    
## slope2       0.689965   0.947657   0.728 0.466568    
## ca1         -2.342301   0.527416  -4.441 8.95e-06 ***
## ca2         -3.483178   0.811640  -4.292 1.77e-05 ***
## ca3         -2.247144   0.937629  -2.397 0.016547 *  
## ca4          1.267961   1.720014   0.737 0.461013    
## thal1        2.637558   2.684285   0.983 0.325808    
## thal2        2.367747   2.596159   0.912 0.361759    
## thal3        0.915115   2.600380   0.352 0.724901    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 179.63  on 280  degrees of freedom
## AIC: 225.63
## 
## Number of Fisher Scoring iterations: 6

Evaluating the Logistic Model (1 point)

# find optimal threshold to maximize accuracy
heart_data$prediction_glm <- predict(model_glm, type = "response")
best_threshold <- 0
best_accuracy <- 0
for (threshold in seq(0.4, 0.6, 0.01)) {
  heart_data$disease_prediction <- ifelse(heart_data$prediction_glm >= threshold, 1, 0)
  accuracy <- sum(heart_data$target == heart_data$disease_prediction) / nrow(heart_data)
                  if (accuracy > best_accuracy) {
                    best_threshold <- threshold
                    best_accuracy <- accuracy
                  }
}

print(paste('Best Threshold:', best_threshold, 'Best Accuracy:', best_accuracy))
## [1] "Best Threshold: 0.54 Best Accuracy: 0.887788778877888"

Avoiding Overfitting (2 points)

# evaluate model on unseen data
set.seed(123)
train_index <- sample(1:nrow(heart_data), 0.8 * nrow(heart_data))
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]

# rerun logistic model on trained data
train_glm <- glm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal,
                 data = train_data,
                 family = "binomial")

test_data$prediction_glm <- predict(train_glm, test_data, type = "response")

# set thresholds
best_threshold <- 0
best_accuracy <- 0

for (threshold in seq(0.4, 0.6, 0.01)) {
  test_data$disease_prediction <- ifelse(test_data$prediction_glm >= threshold, 1, 0)
  accuracy <- sum(test_data$target == test_data$disease_prediction) / nrow(test_data)
                  if (accuracy > best_accuracy) {
                    best_threshold <- threshold
                    best_accuracy <- accuracy
                  }
}

print(paste('Best Threshold:', best_threshold, 'Best Accuracy:', best_accuracy))
## [1] "Best Threshold: 0.4 Best Accuracy: 0.852459016393443"