Import the heart disease dataset:
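Before reading the file, attach the required packages (an inferred setup; the original does not show its library() calls):

library(tidyverse)   # read_csv(), glimpse(), dplyr verbs, ggplot2
library(fastDummies) # dummy_cols()
library(class)       # knn()
library(reshape2)    # melt()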
heart <- read_csv("heart.csv", col_types = "nffnnffnfnfnff")
glimpse(heart)
## Rows: 1,025
## Columns: 14
## $ age <dbl> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51, 52, 3…
## $ sex <fct> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1…
## $ cp <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 2…
## $ trestbps <dbl> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112, 132, 1…
## $ chol <dbl> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149, 341, 2…
## $ fbs <fct> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ restecg <fct> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0…
## $ thalach <dbl> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125, 136, 1…
## $ exang <fct> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0…
## $ oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6, 3.0, 0…
## $ slope <fct> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1…
## $ ca <dbl> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0…
## $ thal <fct> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2, 2, 2…
## $ target <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0…
Show a summary of the data:
summary(heart)
## age sex cp trestbps chol fbs restecg
## Min. :29.00 1:713 0:497 Min. : 94.0 Min. :126 0:872 1:513
## 1st Qu.:48.00 0:312 1:167 1st Qu.:120.0 1st Qu.:211 1:153 0:497
## Median :56.00 2:284 Median :130.0 Median :240 2: 15
## Mean :54.43 3: 77 Mean :131.6 Mean :246
## 3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:275
## Max. :77.00 Max. :200.0 Max. :564
## thalach exang oldpeak slope ca thal
## Min. : 71.0 0:680 Min. :0.000 2:469 Min. :0.0000 3:410
## 1st Qu.:132.0 1:345 1st Qu.:0.000 0: 74 1st Qu.:0.0000 2:544
## Median :152.0 Median :0.800 1:482 Median :0.0000 1: 64
## Mean :149.1 Mean :1.072 Mean :0.7541 0: 7
## 3rd Qu.:166.0 3rd Qu.:1.800 3rd Qu.:1.0000
## Max. :202.0 Max. :6.200 Max. :4.0000
## target
## 0:499
## 1:526
##
##
##
##
Check for missing data:
which(is.na(heart))
## integer(0)
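For a per-column breakdown rather than a flat index vector, base R offers:

# count missing values per column
colSums(is.na(heart))

which returns 0 for every column here, consistent with the empty result above.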
Normalize the data:
# scale a numeric vector to the range [0, 1]
min_max_scaler <- function(x) {
  # x should be a numeric vector
  (x - min(x)) / (max(x) - min(x))
}
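# quick check: min_max_scaler(c(2, 5, 8)) returns 0.0 0.5 1.0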
numeric_fields <- colnames(select_if(heart, is.numeric))
heart <- heart %>%
  mutate(across(all_of(numeric_fields), min_max_scaler))
# can also use the caret library
# process <- preProcess(heart, method=c("range"))
# data <- predict(process, heart)
summary(heart)
## age sex cp trestbps chol fbs
## Min. :0.0000 1:713 0:497 Min. :0.0000 Min. :0.0000 0:872
## 1st Qu.:0.3958 0:312 1:167 1st Qu.:0.2453 1st Qu.:0.1941 1:153
## Median :0.5625 2:284 Median :0.3396 Median :0.2603
## Mean :0.5299 3: 77 Mean :0.3548 Mean :0.2740
## 3rd Qu.:0.6667 3rd Qu.:0.4340 3rd Qu.:0.3402
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## restecg thalach exang oldpeak slope ca
## 1:513 Min. :0.0000 0:680 Min. :0.0000 2:469 Min. :0.0000
## 0:497 1st Qu.:0.4656 1:345 1st Qu.:0.0000 0: 74 1st Qu.:0.0000
## 2: 15 Median :0.6183 Median :0.1290 1:482 Median :0.0000
## Mean :0.5963 Mean :0.1728 Mean :0.1885
## 3rd Qu.:0.7252 3rd Qu.:0.2903 3rd Qu.:0.2500
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## thal target
## 3:410 0:499
## 2:544 1:526
## 1: 64
## 0: 7
##
##
Normalization is important for KNN because features with wider value ranges would otherwise have a disproportionate impact on the Euclidean distance calculations at the core of the algorithm. The summary above confirms that every numeric field now has a minimum of 0 and a maximum of 1.
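Concretely, for two observations \(\mathbf{x}\) and \(\mathbf{y}\) with \(p\) features, KNN measures \(d(\mathbf{x}, \mathbf{y}) = \sqrt{\sum_{i=1}^{p} (x_i - y_i)^2}\); without scaling, a feature such as chol (spanning 126 to 564) would dominate this sum over one such as oldpeak (spanning 0 to 6.2).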
Next, we create dummy variables for the categorical predictor fields:
# all factor columns except the last one (target)
categorical_fields <- head(colnames(select_if(heart, is.factor)), -1)
heart <- dummy_cols(heart,
                    select_columns = categorical_fields,
                    remove_first_dummy = TRUE,
                    remove_selected_columns = TRUE)
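As a quick sanity check (a sketch; the exact dummy column names depend on the factor levels in the file), every column except target should now be numeric:

# TRUE if all predictor columns are numeric (scaled values or 0/1 dummies)
all(sapply(select(heart, -target), is.numeric))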
Split the data into training and testing sets:
# define training data cutoff
train_size <- 0.75
cutoff <- floor(train_size * nrow(heart))
# split data randomly into train and test sets
set.seed(1234)
train_ind <- sample(seq_len(nrow(heart)), size = cutoff)
train <- heart[train_ind, ]
test <- heart[-train_ind, ]
# separate into target and predictors
train_x <- select(train, -target)
train_y <- pull(train, target)
test_x <- select(test, -target)
test_y <- pull(test, target)
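Note that this split is purely random, so the 0/1 balance in train and test can drift slightly. The caret package (mentioned above as a scaling option) offers a stratified alternative; a sketch, assuming caret is installed:

# class-stratified split keeping the target ratio similar in both sets
# library(caret)
# train_ind <- createDataPartition(heart$target, p = train_size, list = FALSE)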
Make predictions using the KNN model (without any tuning):
heart_pred_initial <-
  knn(
    train = train_x,
    test = test_x,
    cl = train_y,
    k = 5
  )
head(heart_pred_initial)
## [1] 1 0 0 1 0 1
## Levels: 0 1
Make a confusion matrix showing how the model performed on the test data:
pred_initial_tab <- table(test_y, heart_pred_initial)
pred_initial_tab
##       heart_pred_initial
## test_y   0   1
##      0 105  28
##      1  28  96
sum(diag(pred_initial_tab)) / nrow(test_x)
## [1] 0.7821012
The initial model achieves an accuracy of ~78% using 5 neighbors.
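Accuracy alone can hide class-specific errors. The same confusion matrix also yields sensitivity and specificity; a short sketch using the counts above:

# per-class rates from the confusion matrix
pred_initial_tab[2, 2] / sum(pred_initial_tab[2, ])  # sensitivity: 96/124, ~0.77
pred_initial_tab[1, 1] / sum(pred_initial_tab[1, ])  # specificity: 105/133, ~0.79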
The only tunable parameter in this KNN model is the number of neighbors \(n\) (passed as the k argument of knn()).
Evaluate train and test accuracy for different values of \(n\):
# helper: fit KNN with n neighbors and return accuracy on the supplied test set
run_knn <- function(train_x, train_y, test_x, test_y, n) {
  pred <- knn(
    train = train_x,
    test = test_x,
    cl = train_y,
    k = n
  )
  pred_tab <- table(test_y, pred)
  sum(diag(pred_tab)) / nrow(test_x)
}
# candidate neighbor counts and preallocated accuracy vectors
ns <- 1:25
test_accs <- numeric(length(ns))
train_accs <- numeric(length(ns))
# run knn model for different values of n
for (n in ns) {
  test_accs[n] <- run_knn(train_x, train_y, test_x, test_y, n)
  train_accs[n] <- run_knn(train_x, train_y, train_x, train_y, n)
}
# store in dataframe
preds_df <- data.frame(ns, test_accs, train_accs)
head(preds_df)
## ns test_accs train_accs
## 1 1 0.9883268 1.0000000
## 2 2 0.9299611 0.9921875
## 3 3 0.7937743 0.9283854
## 4 4 0.7782101 0.8945312
## 5 5 0.7937743 0.8828125
## 6 6 0.8365759 0.8750000
Plot the results:
plt_data <- preds_df
colnames(plt_data) <- c("n", "Test Set Accuracy", "Train Set Accuracy")
plt_data <- melt(plt_data, id.vars = "n",
                 variable.name = "Dataset",
                 value.name = "Accuracy")
ggplot(data = plt_data) +
  geom_line(aes(x = n, y = Accuracy, color = Dataset))
Surprisingly, the best test-set performance comes from using only 1 neighbor. However, such a small \(n\) makes the model highly sensitive to noise and outliers, so the second-highest value (at \(n = 7\)) is probably a better choice.
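The best-scoring \(n\) can also be read off programmatically:

preds_df[which.max(preds_df$test_accs), ]

Finally, note that tuning \(n\) against the test set risks overfitting to that particular split; cross-validating on the training data is the more standard approach. A sketch using caret (assuming it is installed; its knn method tunes the parameter k):

# 5-fold cross-validation over the same range of neighbor counts
# library(caret)
# cv_fit <- train(x = train_x, y = train_y, method = "knn",
#                 trControl = trainControl(method = "cv", number = 5),
#                 tuneGrid = data.frame(k = 1:25))
# cv_fit$bestTune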