Import the heart disease dataset:
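Before reading the file, attach the required packages (an inferred setup; the original does not show its library() calls):

library(tidyverse)   # read_csv(), glimpse(), dplyr verbs, ggplot2
library(fastDummies) # dummy_cols()
library(class)       # knn()
library(reshape2)    # melt()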
heart <- read_csv("heart.csv", col_types = "nffnnffnfnfnff")
glimpse(heart)
## Rows: 1,025
## Columns: 14
## $ age <dbl> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51, 52, 3…
## $ sex <fct> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1…
## $ cp <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 2…
## $ trestbps <dbl> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112, 132, 1…
## $ chol <dbl> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149, 341, 2…
## $ fbs <fct> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ restecg <fct> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0…
## $ thalach <dbl> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125, 136, 1…
## $ exang <fct> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0…
## $ oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6, 3.0, 0…
## $ slope <fct> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1…
## $ ca <dbl> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0…
## $ thal <fct> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2, 2, 2…
## $ target <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0…
Show a summary of the data:
summary(heart)
## age sex cp trestbps chol fbs restecg
## Min. :29.00 1:713 0:497 Min. : 94.0 Min. :126 0:872 1:513
## 1st Qu.:48.00 0:312 1:167 1st Qu.:120.0 1st Qu.:211 1:153 0:497
## Median :56.00 2:284 Median :130.0 Median :240 2: 15
## Mean :54.43 3: 77 Mean :131.6 Mean :246
## 3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:275
## Max. :77.00 Max. :200.0 Max. :564
## thalach exang oldpeak slope ca thal
## Min. : 71.0 0:680 Min. :0.000 2:469 Min. :0.0000 3:410
## 1st Qu.:132.0 1:345 1st Qu.:0.000 0: 74 1st Qu.:0.0000 2:544
## Median :152.0 Median :0.800 1:482 Median :0.0000 1: 64
## Mean :149.1 Mean :1.072 Mean :0.7541 0: 7
## 3rd Qu.:166.0 3rd Qu.:1.800 3rd Qu.:1.0000
## Max. :202.0 Max. :6.200 Max. :4.0000
## target
## 0:499
## 1:526
##
##
##
##
Check for missing data:
which(is.na(heart))
## integer(0)
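For a per-column breakdown rather than a flat index vector, base R offers:

# count missing values per column
colSums(is.na(heart))

which returns 0 for every column here, consistent with the empty result above.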
Normalize the data:
# scale a numeric vector to the range [0, 1]
min_max_scaler <- function(x) {
  # x should be a numeric vector
  (x - min(x)) / (max(x) - min(x))
}
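# quick check: min_max_scaler(c(2, 5, 8)) returns 0.0 0.5 1.0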
numeric_fields <- colnames(select_if(heart, is.numeric))
heart <- heart %>%
  mutate(across(all_of(numeric_fields), min_max_scaler))
# can also use the caret library
# process <- preProcess(heart, method=c("range"))
# data <- predict(process, heart)
summary(heart)
## age sex cp trestbps chol fbs
## Min. :0.0000 1:713 0:497 Min. :0.0000 Min. :0.0000 0:872
## 1st Qu.:0.3958 0:312 1:167 1st Qu.:0.2453 1st Qu.:0.1941 1:153
## Median :0.5625 2:284 Median :0.3396 Median :0.2603
## Mean :0.5299 3: 77 Mean :0.3548 Mean :0.2740
## 3rd Qu.:0.6667 3rd Qu.:0.4340 3rd Qu.:0.3402
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## restecg thalach exang oldpeak slope ca
## 1:513 Min. :0.0000 0:680 Min. :0.0000 2:469 Min. :0.0000
## 0:497 1st Qu.:0.4656 1:345 1st Qu.:0.0000 0: 74 1st Qu.:0.0000
## 2: 15 Median :0.6183 Median :0.1290 1:482 Median :0.0000
## Mean :0.5963 Mean :0.1728 Mean :0.1885
## 3rd Qu.:0.7252 3rd Qu.:0.2903 3rd Qu.:0.2500
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## thal target
## 3:410 0:499
## 2:544 1:526
## 1: 64
## 0: 7
##
##
Normalization is important for KNN because features with wider value ranges would otherwise have a disproportionate impact on the Euclidean distance calculations at the core of the algorithm. The summary above confirms that every numeric field now has a minimum of 0 and a maximum of 1.
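Concretely, for two observations \(\mathbf{x}\) and \(\mathbf{y}\) with \(p\) features, KNN measures \(d(\mathbf{x}, \mathbf{y}) = \sqrt{\sum_{i=1}^{p} (x_i - y_i)^2}\); without scaling, a feature such as chol (spanning 126 to 564) would dominate this sum over one such as oldpeak (spanning 0 to 6.2).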
Next, we create dummy variables for the categorical predictor fields:
# all factor columns except the last one (target)
categorical_fields <- head(colnames(select_if(heart, is.factor)), -1)
heart <- dummy_cols(heart,
                    select_columns = categorical_fields,
                    remove_first_dummy = TRUE,
                    remove_selected_columns = TRUE)
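As a quick sanity check (a sketch; the exact dummy column names depend on the factor levels in the file), every column except target should now be numeric:

# TRUE if all predictor columns are numeric (scaled values or 0/1 dummies)
all(sapply(select(heart, -target), is.numeric))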
Split the data into training and testing sets:
# define training data cutoff
train_size <- 0.75
cutoff <- floor(train_size * nrow(heart))
# split data randomly into train and test sets
set.seed(1234)
train_ind <- sample(seq_len(nrow(heart)), size = cutoff)
train <- heart[train_ind, ]
test <- heart[-train_ind, ]
# separate into target and predictors
train_x <- select(train, -target)
train_y <- pull(train, target)
test_x <- select(test, -target)
test_y <- pull(test, target)
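Note that this split is purely random, so the 0/1 balance in train and test can drift slightly. The caret package (mentioned above as a scaling option) offers a stratified alternative; a sketch, assuming caret is installed:

# class-stratified split keeping the target ratio similar in both sets
# library(caret)
# train_ind <- createDataPartition(heart$target, p = train_size, list = FALSE)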
Make predictions using the KNN model (without any tuning):
heart_pred_initial <-
  knn(
    train = train_x,
    test = test_x,
    cl = train_y,
    k = 5
  )
head(heart_pred_initial)
## [1] 1 0 0 1 0 1
## Levels: 0 1
Make a confusion matrix showing how the model performed on the test data:
pred_initial_tab <- table(test_y, heart_pred_initial)
pred_initial_tab
##       heart_pred_initial
## test_y   0   1
##      0 105  28
##      1  28  96
sum(diag(pred_initial_tab)) / nrow(test_x)
## [1] 0.7821012
The initial model achieves an accuracy of ~78% using 5 neighbors.
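Accuracy alone can hide class-specific errors. The same confusion matrix also yields sensitivity and specificity; a short sketch using the counts above:

# per-class rates from the confusion matrix
pred_initial_tab[2, 2] / sum(pred_initial_tab[2, ])  # sensitivity: 96/124, ~0.77
pred_initial_tab[1, 1] / sum(pred_initial_tab[1, ])  # specificity: 105/133, ~0.79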
The only tunable parameter in this KNN model is the number of neighbors \(n\) (passed as the k argument of knn()).
Evaluate train and test accuracy for different values of \(n\):
# helper: fit KNN with n neighbors and return accuracy on the supplied test set
run_knn <- function(train_x, train_y, test_x, test_y, n) {
  pred <- knn(
    train = train_x,
    test = test_x,
    cl = train_y,
    k = n
  )
  pred_tab <- table(test_y, pred)
  sum(diag(pred_tab)) / nrow(test_x)
}
# candidate neighbor counts and preallocated accuracy vectors
ns <- 1:25
test_accs <- numeric(length(ns))
train_accs <- numeric(length(ns))
# run knn model for different values of n
for (n in ns) {
  test_accs[n] <- run_knn(train_x, train_y, test_x, test_y, n)
  train_accs[n] <- run_knn(train_x, train_y, train_x, train_y, n)
}
# store in dataframe
preds_df <- data.frame(ns, test_accs, train_accs)
head(preds_df)
## ns test_accs train_accs
## 1 1 0.9883268 1.0000000
## 2 2 0.9299611 0.9921875
## 3 3 0.7937743 0.9283854
## 4 4 0.7782101 0.8945312
## 5 5 0.7937743 0.8828125
## 6 6 0.8365759 0.8750000
Plot the results:
plt_data <- preds_df
colnames(plt_data) <- c("n", "Test Set Accuracy", "Train Set Accuracy")
plt_data <- melt(plt_data, id.vars = "n",
                 variable.name = "Dataset",
                 value.name = "Accuracy")
ggplot(data = plt_data) +
  geom_line(aes(x = n, y = Accuracy, color = Dataset))
Surprisingly, the best test-set performance comes from using only 1 neighbor. However, such a small \(n\) makes the model highly sensitive to noise and outliers, so the second-highest value (at \(n = 7\)) is probably a better choice.
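The best-scoring \(n\) can also be read off programmatically:

preds_df[which.max(preds_df$test_accs), ]

Finally, note that tuning \(n\) against the test set risks overfitting to that particular split; cross-validating on the training data is the more standard approach. A sketch using caret (assuming it is installed; its knn method tunes the parameter k):

# 5-fold cross-validation over the same range of neighbor counts
# library(caret)
# cv_fit <- train(x = train_x, y = train_y, method = "knn",
#                 trControl = trainControl(method = "cv", number = 5),
#                 tuneGrid = data.frame(k = 1:25))
# cv_fit$bestTune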