Jackson_Garrett

## Homework 1: Classification

Loading the Data (1 point)

# load data
heart_data <- read_csv("heart_disease.csv")

## Rows: 303 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Preprocessing (1 point)

# check for missing data
colSums(is.na(heart_data))

##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0

# check variable types
glimpse(heart_data)

## Rows: 303
## Columns: 14
## $ age      <dbl> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58, 5…
## $ sex      <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1…
## $ cp       <dbl> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3, 0…
## $ trestbps <dbl> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130, 1…
## $ chol     <dbl> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275, 2…
## $ fbs      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ restecg  <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1…
## $ thalach  <dbl> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139, 1…
## $ exang    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ oldpeak  <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2, 0…
## $ slope    <dbl> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2, 1…
## $ ca       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0…
## $ thal     <dbl> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3…
## $ target   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…

# convert necessary variables to factors
heart_data <- heart_data %>%
  mutate(sex = as.factor(sex),
         cp = as.factor(cp),
         fbs = as.factor(fbs),
         restecg = as.factor(restecg),
         exang = as.factor(exang),
         slope = as.factor(slope),
         ca = as.factor(ca),
         thal = as.factor(thal))

Building a Linear Model (1 point)

# build a linear model
model_lm <- lm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal, data = heart_data)
summary(model_lm)

## 
## Call:
## lm(formula = target ~ age + sex + cp + trestbps + chol + fbs + 
##     restecg + thalach + exang + oldpeak + slope + ca + thal, 
##     data = heart_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.01468 -0.18519  0.03069  0.22575  1.02624 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.4632832  0.3688720   1.256 0.210183    
## age          0.0027129  0.0026488   1.024 0.306638    
## sex1        -0.1640717  0.0482271  -3.402 0.000766 ***
## cp1          0.1688621  0.0639741   2.640 0.008767 ** 
## cp2          0.2252316  0.0537885   4.187 3.78e-05 ***
## cp3          0.2657100  0.0811006   3.276 0.001184 ** 
## trestbps    -0.0023401  0.0012187  -1.920 0.055844 .  
## chol        -0.0003189  0.0004021  -0.793 0.428416    
## fbs1         0.0333463  0.0574909   0.580 0.562362    
## restecg1     0.0469891  0.0408340   1.151 0.250823    
## restecg2    -0.0870584  0.1757245  -0.495 0.620689    
## thalach      0.0018708  0.0011189   1.672 0.095634 .  
## exang1      -0.0935220  0.0499945  -1.871 0.062437 .  
## oldpeak     -0.0415842  0.0230281  -1.806 0.072023 .  
## slope1      -0.0653951  0.0859299  -0.761 0.447281    
## slope2       0.0736023  0.0950718   0.774 0.439480    
## ca1         -0.2703534  0.0530757  -5.094 6.46e-07 ***
## ca2         -0.3426057  0.0682472  -5.020 9.20e-07 ***
## ca3         -0.2960027  0.0861537  -3.436 0.000680 ***
## ca4          0.0443264  0.1555910   0.285 0.775939    
## thal1        0.2283726  0.2543715   0.898 0.370068    
## thal2        0.2879971  0.2418275   1.191 0.234694    
## thal3        0.0784919  0.2436939   0.322 0.747623    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.335 on 280 degrees of freedom
## Multiple R-squared:  0.5818, Adjusted R-squared:  0.549 
## F-statistic: 17.71 on 22 and 280 DF,  p-value: < 2.2e-16

Evaluating the Linear Model (1 point)

# visualize predictions
heart_data$prediction_lm <- predict(model_lm)
boxplot(heart_data$prediction_lm ~ heart_data$target, main = "Predicted vs. Actual Heart Disease",
        xlab = "Actual Heart Disease", ylab = "Predicted Heart Disease")

An adjusted R-squared value of 0.549 indicates a moderately strong model

Threshold Selection (2 point)

# establish threshold
threshold <- 0.5
heart_data$disease_prediction <- ifelse(heart_data$prediction_lm >= threshold, 1, 0)
accuracy <- sum(heart_data$target == heart_data$disease_prediction) / nrow(heart_data)
print(paste('Accuracy:', accuracy))

## [1] "Accuracy: 0.877887788778878"

Improving the Model with Logistic Regression (1 point)

# run a logistic model
model_glm <- glm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal,
                 family = "binomial", data = heart_data)

summary(model_glm)

## 
## Call:
## glm(formula = target ~ age + sex + cp + trestbps + chol + fbs + 
##     restecg + thalach + exang + oldpeak + slope + ca + thal, 
##     family = "binomial", data = heart_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.179045   3.705420   0.048 0.961461    
## age          0.027819   0.025428   1.094 0.273938    
## sex1        -1.862297   0.570844  -3.262 0.001105 ** 
## cp1          0.864708   0.578000   1.496 0.134645    
## cp2          2.003186   0.529356   3.784 0.000154 ***
## cp3          2.417107   0.719242   3.361 0.000778 ***
## trestbps    -0.026162   0.011943  -2.191 0.028481 *  
## chol        -0.004291   0.004245  -1.011 0.312053    
## fbs1         0.445666   0.587977   0.758 0.448472    
## restecg1     0.460582   0.399615   1.153 0.249089    
## restecg2    -0.714204   2.768873  -0.258 0.796453    
## thalach      0.020055   0.011859   1.691 0.090820 .  
## exang1      -0.779111   0.451839  -1.724 0.084652 .  
## oldpeak     -0.397174   0.242346  -1.639 0.101239    
## slope1      -0.775084   0.880495  -0.880 0.378707    
## slope2       0.689965   0.947657   0.728 0.466568    
## ca1         -2.342301   0.527416  -4.441 8.95e-06 ***
## ca2         -3.483178   0.811640  -4.292 1.77e-05 ***
## ca3         -2.247144   0.937629  -2.397 0.016547 *  
## ca4          1.267961   1.720014   0.737 0.461013    
## thal1        2.637558   2.684285   0.983 0.325808    
## thal2        2.367747   2.596159   0.912 0.361759    
## thal3        0.915115   2.600380   0.352 0.724901    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 179.63  on 280  degrees of freedom
## AIC: 225.63
## 
## Number of Fisher Scoring iterations: 6

Evaluating the Logistic Model (1 point)

# find optimal threshold to maximize accuracy
heart_data$prediction_glm <- predict(model_glm, type = "response")
best_threshold <- 0
best_accuracy <- 0
for (threshold in seq(0.4, 0.6, 0.01)) {
  heart_data$disease_prediction <- ifelse(heart_data$prediction_glm >= threshold, 1, 0)
  accuracy <- sum(heart_data$target == heart_data$disease_prediction) / nrow(heart_data)
                  if (accuracy > best_accuracy) {
                    best_threshold <- threshold
                    best_accuracy <- accuracy
                  }
}

print(paste('Best Threshold:', best_threshold, 'Best Accuracy:', best_accuracy))

## [1] "Best Threshold: 0.54 Best Accuracy: 0.887788778877888"

Avoiding Overfitting (2 points)

# evaluate model on unseen data
set.seed(123)
train_index <- sample(1:nrow(heart_data), 0.8 * nrow(heart_data))
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]

# rerun logistic model on trained data
train_glm <- glm(target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + exang + oldpeak + slope + ca + thal,
                 data = train_data,
                 family = "binomial")

test_data$prediction_glm <- predict(train_glm, test_data, type = "response")

# set thresholds
best_threshold <- 0
best_accuracy <- 0

for (threshold in seq(0.4, 0.6, 0.01)) {
  test_data$disease_prediction <- ifelse(test_data$prediction_glm >= threshold, 1, 0)
  accuracy <- sum(test_data$target == test_data$disease_prediction) / nrow(test_data)
                  if (accuracy > best_accuracy) {
                    best_threshold <- threshold
                    best_accuracy <- accuracy
                  }
}

print(paste('Best Threshold:', best_threshold, 'Best Accuracy:', best_accuracy))

## [1] "Best Threshold: 0.4 Best Accuracy: 0.852459016393443"

Jackson_Garrett_Module1HW

Garrett Jackson

2025-04-09

Loading the Data (1 point)

Data Preprocessing (1 point)

Building a Linear Model (1 point)

Evaluating the Linear Model (1 point)

Threshold Selection (2 point)

Improving the Model with Logistic Regression (1 point)

Evaluating the Logistic Model (1 point)

Avoiding Overfitting (2 points)