# Load the heart-disease data from the working directory and print the
# structure of the raw frame so the column types can be checked before recoding.
# NOTE(review): the first column name comes through garbled (see the output
# below) - presumably a byte-order mark at the start of the file; confirm.
heart <- read.csv(file = "heart.csv")
str(object = heart)
## 'data.frame': 303 obs. of 14 variables:
## $ ï..age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : int 1 1 0 1 0 1 0 1 1 1 ...
## $ cp : int 3 2 1 1 0 0 1 1 2 2 ...
## $ trestbps: int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : int 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : int 0 1 0 1 1 1 0 1 1 1 ...
## $ thalach : int 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : int 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : int 0 0 2 2 2 1 1 2 2 2 ...
## $ ca : int 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : int 1 2 2 2 2 1 2 3 3 2 ...
## $ target : int 1 1 1 1 1 1 1 1 1 1 ...
Data dictionary. The value labels below follow the original data description. The output above also shows codes that are not in these lists (for example cp = 0, slope = 0, and thal = 1 or 2), so confirm the coding used in this CSV before interpreting individual levels.

- age: The person’s age in years.
- sex: The person’s sex (1 = male, 0 = female).
- cp: The type of chest pain experienced (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic).
- trestbps: The person’s resting blood pressure (mm Hg on admission to the hospital).
- chol: The person’s cholesterol measurement in mg/dl.
- fbs: Whether the person’s fasting blood sugar is above 120 mg/dl (1 = true; 0 = false).
- restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes’ criteria).
- thalach: The person’s maximum heart rate achieved.
- exang: Exercise-induced angina (1 = yes; 0 = no).
- oldpeak: ST depression induced by exercise relative to rest (‘ST’ relates to positions on the ECG plot. See more here).
- slope: The slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping).
- ca: The number of major vessels (0–3).
- thal: A blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversible defect).
- target: Heart disease (0 = no, 1 = yes).
# Recode the raw integer columns into analysis-ready types.
#
# The first column holds age, but its name depends on how read.csv() decoded
# the start of the file (here it arrived as a garbled `..age` variant,
# presumably because of a byte-order mark). Referring to that garbled name
# breaks as soon as the file is read on another platform or with another
# encoding, so the column is renamed by position instead.
names(heart)[1] <- "age"

heart <- heart %>%
  mutate(
    age = as.numeric(age),
    cp = as.factor(cp),
    # 0/1 codes mapped to readable labels.
    sex = factor(sex, levels = c(0, 1), labels = c("Female", "Male")),
    fbs = as.factor(fbs),
    restecg = as.factor(restecg),
    exang = as.factor(exang),
    target = factor(
      target,
      levels = c(0, 1),
      labels = c("Healthy", "Unhealthy")
    )
  )

# Move age to the last position. This reproduces the column order the previous
# "append a new age column, then drop column 1" approach produced, so any later
# code that relies on column positions keeps working.
heart <- heart[, c(setdiff(names(heart), "age"), "age")]
# Compact overview of the recoded frame: one line per column with its type.
heart %>% glimpse()
## Rows: 303
## Columns: 14
## $ sex <fct> Male, Male, Female, Male, Female, Male, Female, Male, Male, M~
## $ cp <fct> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3, 0~
## $ trestbps <int> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130, 1~
## $ chol <int> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275, 2~
## $ fbs <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0~
## $ restecg <fct> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1~
## $ thalach <int> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139, 1~
## $ exang <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~
## $ oldpeak <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2, 0~
## $ slope <int> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2, 1~
## $ ca <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0~
## $ thal <int> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3~
## $ target <fct> Unhealthy, Unhealthy, Unhealthy, Unhealthy, Unhealthy, Unheal~
## $ age <dbl> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58, 5~
# Count missing values per column.
heart %>%
  is.na() %>%
  colSums()
## sex cp trestbps chol fbs restecg thalach exang
## 0 0 0 0 0 0 0 0
## oldpeak slope ca thal target age
## 0 0 0 0 0 0
# Class balance of the outcome, as proportions.
# table() is called directly (not piped into) so the printed table keeps an
# empty dimension name.
table(heart$target) %>% prop.table()
##
## Healthy Unhealthy
## 0.4554455 0.5445545
# Class balance of the outcome, as absolute counts.
table(heart[["target"]])
##
## Healthy Unhealthy
## 138 165
# Select the "Rounding" method for sample(). R warns that this sampler is
# non-uniform (see the warning below); it is presumably chosen so the
# train/test split matches results produced with R versions before 3.6.0.
# This changes the session-wide RNG configuration, so it affects every later
# call that draws random samples.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
# Reproducible 80/20 train/test split on row indices.
set.seed(100)
intrain <- sample(x = nrow(heart), size = nrow(heart) * 0.8)

# Rows drawn by sample() form the training set; the remainder is the test set.
heart_train <- heart[intrain, ]
heart_test <- heart[-intrain, ]
# Show the factor level order of the outcome.
levels(heart$target)
## [1] "Healthy" "Unhealthy"
# Build the predictor frames and label vectors for k-NN.
#
# Predictors: every numeric column of the training set. The test set is
# subset with the training column names so both frames are guaranteed to hold
# the same features in the same order.
# (The previous code removed "column 9" from the test frame only. With eight
# numeric columns that index is out of range, so it silently did nothing, and
# it would have desynchronised train and test had it matched a column.)
# NOTE(review): slope, ca and thal are integer codes and are therefore
# treated as numeric distances here - confirm that this is intended.
heart_train_x <- heart_train %>% select_if(is.numeric)
heart_test_x <- heart_test[, names(heart_train_x), drop = FALSE]

# Labels: take the outcome by name rather than by position 13, so reordering
# or adding columns cannot silently pick up a different variable.
heart_train_y <- heart_train$target
heart_test_y <- heart_test$target
# Standardise the predictors. The centre and scale are estimated on the
# training set only and then reused for the test set, so no test-set
# information enters the transformation.
heart_train_xs <- heart_train_x %>% scale()
heart_test_xs <- heart_test_x %>%
  scale(
    center = attr(heart_train_xs, "scaled:center"),
    scale = attr(heart_train_xs, "scaled:scale")
  )
# Rule-of-thumb starting value for k: the square root of the training size.
heart_train %>%
  nrow() %>%
  sqrt() %>%
  round()
## [1] 16
# Classify the test rows by k-nearest neighbours on the scaled predictors,
# using the k suggested by the square-root heuristic above.
# NOTE(review): k is even and there are two classes, so a vote can tie -
# check how the knn() implementation in use resolves ties.
knn_model <- knn(
  train = heart_train_xs,
  test = heart_test_xs,
  cl = heart_train_y,
  k = 16
)
# Peek at the first few predicted classes.
knn_model %>% head()
## [1] Healthy Unhealthy Unhealthy Unhealthy Unhealthy Unhealthy
## Levels: Healthy Unhealthy
# Compare the k-NN predictions with the true test labels, treating
# "Unhealthy" as the positive class for sensitivity, specificity, etc.
confusionMatrix(
  data = knn_model,
  reference = heart_test_y,
  positive = "Unhealthy"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Healthy Unhealthy
## Healthy 22 4
## Unhealthy 9 26
##
## Accuracy : 0.7869
## 95% CI : (0.6632, 0.8814)
## No Information Rate : 0.5082
## P-Value [Acc > NIR] : 6.823e-06
##
## Kappa : 0.5748
##
## Mcnemar's Test P-Value : 0.2673
##
## Sensitivity : 0.8667
## Specificity : 0.7097
## Pos Pred Value : 0.7429
## Neg Pred Value : 0.8462
## Prevalence : 0.4918
## Detection Rate : 0.4262
## Detection Prevalence : 0.5738
## Balanced Accuracy : 0.7882
##
## 'Positive' Class : Unhealthy
##
# Summarise k-NN performance with "Unhealthy" as the positive class.
#
# The cell counts are taken from a cross-tabulation of the predictions
# against the true test labels instead of being typed in by hand, so the
# metrics stay correct if the split, the seed or k changes.
knn_cm <- table(Prediction = knn_model, Reference = heart_test_y)

tp <- knn_cm["Unhealthy", "Unhealthy"] # predicted Unhealthy, truly Unhealthy
tn <- knn_cm["Healthy", "Healthy"]     # predicted Healthy, truly Healthy
fp <- knn_cm["Unhealthy", "Healthy"]   # predicted Unhealthy, truly Healthy
fn <- knn_cm["Healthy", "Unhealthy"]   # predicted Healthy, truly Unhealthy

Accuracy2 <- round((tp + tn) / sum(knn_cm), 2)
Recall2 <- round(tp / (tp + fn), 2)
Precision2 <- round(tp / (tp + fp), 2)
Specificity2 <- round(tn / (tn + fp), 2)

knn_perf <- cbind.data.frame(Accuracy2, Recall2, Precision2, Specificity2)
knn_perf
## Accuracy2 Recall2 Precision2 Specificity2
## 1 0.79 0.87 0.74 0.71