Chapter 3: Classification using Nearest Neighbors
Example: Classifying Cancer Samples
Step 2: Exploring and preparing the data
# Import the CSV file
wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)
# Examine the structure of the dataset
str(wbcd)
'data.frame': 569 obs. of 32 variables:
$ id : int 87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
$ diagnosis : chr "B" "B" "B" "B" ...
$ radius_mean : num 12.3 10.6 11 11.3 15.2 ...
$ texture_mean : num 12.4 18.9 16.8 13.4 13.2 ...
$ perimeter_mean : num 78.8 69.3 70.9 73 97.7 ...
$ area_mean : num 464 346 373 385 712 ...
$ smoothness_mean : num 0.1028 0.0969 0.1077 0.1164 0.0796 ...
$ compactness_mean : num 0.0698 0.1147 0.078 0.1136 0.0693 ...
$ concavity_mean : num 0.0399 0.0639 0.0305 0.0464 0.0339 ...
$ points_mean : num 0.037 0.0264 0.0248 0.048 0.0266 ...
$ symmetry_mean : num 0.196 0.192 0.171 0.177 0.172 ...
$ dimension_mean : num 0.0595 0.0649 0.0634 0.0607 0.0554 ...
$ radius_se : num 0.236 0.451 0.197 0.338 0.178 ...
$ texture_se : num 0.666 1.197 1.387 1.343 0.412 ...
$ perimeter_se : num 1.67 3.43 1.34 1.85 1.34 ...
$ area_se : num 17.4 27.1 13.5 26.3 17.7 ...
$ smoothness_se : num 0.00805 0.00747 0.00516 0.01127 0.00501 ...
$ compactness_se : num 0.0118 0.03581 0.00936 0.03498 0.01485 ...
$ concavity_se : num 0.0168 0.0335 0.0106 0.0219 0.0155 ...
$ points_se : num 0.01241 0.01365 0.00748 0.01965 0.00915 ...
$ symmetry_se : num 0.0192 0.035 0.0172 0.0158 0.0165 ...
$ dimension_se : num 0.00225 0.00332 0.0022 0.00344 0.00177 ...
$ radius_worst : num 13.5 11.9 12.4 11.9 16.2 ...
$ texture_worst : num 15.6 22.9 26.4 15.8 15.7 ...
$ perimeter_worst : num 87 78.3 79.9 76.5 104.5 ...
$ area_worst : num 549 425 471 434 819 ...
$ smoothness_worst : num 0.139 0.121 0.137 0.137 0.113 ...
$ compactness_worst: num 0.127 0.252 0.148 0.182 0.174 ...
$ concavity_worst : num 0.1242 0.1916 0.1067 0.0867 0.1362 ...
$ points_worst : num 0.0939 0.0793 0.0743 0.0861 0.0818 ...
$ symmetry_worst : num 0.283 0.294 0.3 0.21 0.249 ...
$ dimension_worst : num 0.0677 0.0759 0.0788 0.0678 0.0677 ...
# Drop the 'id' feature as it is not relevant for classification
wbcd <- wbcd[-1]
# Display frequency table of the diagnosis variable
table(wbcd$diagnosis)
B M
357 212
# Convert diagnosis variable to a factor with meaningful labels
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
labels = c("Benign", "Malignant"))
# Calculate and display proportions of each diagnosis class
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
Benign Malignant
62.7 37.3
# Summarize three selected numeric features
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])
radius_mean area_mean smoothness_mean
Min. : 6.981 Min. : 143.5 Min. :0.05263
1st Qu.:11.700 1st Qu.: 420.3 1st Qu.:0.08637
Median :13.370 Median : 551.1 Median :0.09587
Mean :14.127 Mean : 654.9 Mean :0.09636
3rd Qu.:15.780 3rd Qu.: 782.7 3rd Qu.:0.10530
Max. :28.110 Max. :2501.0 Max. :0.16340
# Define a normalization function to scale numeric values between 0 and 1
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
# Apply normalization to all numeric features (excluding the diagnosis label)
wbcd_norm <- as.data.frame(lapply(wbcd[2:31], normalize))
# Display summary of the normalized data
summary(wbcd_norm)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.2233 1st Qu.:0.2185 1st Qu.:0.2168 1st Qu.:0.1174 1st Qu.:0.3046 1st Qu.:0.1397
Median :0.3024 Median :0.3088 Median :0.2933 Median :0.1729 Median :0.3904 Median :0.2247
Mean :0.3382 Mean :0.3240 Mean :0.3329 Mean :0.2169 Mean :0.3948 Mean :0.2606
3rd Qu.:0.4164 3rd Qu.:0.4089 3rd Qu.:0.4168 3rd Qu.:0.2711 3rd Qu.:0.4755 3rd Qu.:0.3405
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
concavity_mean points_mean symmetry_mean dimension_mean radius_se
Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
1st Qu.:0.06926 1st Qu.:0.1009 1st Qu.:0.2823 1st Qu.:0.1630 1st Qu.:0.04378
Median :0.14419 Median :0.1665 Median :0.3697 Median :0.2439 Median :0.07702
Mean :0.20806 Mean :0.2431 Mean :0.3796 Mean :0.2704 Mean :0.10635
3rd Qu.:0.30623 3rd Qu.:0.3678 3rd Qu.:0.4530 3rd Qu.:0.3404 3rd Qu.:0.13304
Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
texture_se perimeter_se area_se smoothness_se compactness_se
Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
1st Qu.:0.1047 1st Qu.:0.04000 1st Qu.:0.02064 1st Qu.:0.1175 1st Qu.:0.08132
Median :0.1653 Median :0.07209 Median :0.03311 Median :0.1586 Median :0.13667
Mean :0.1893 Mean :0.09938 Mean :0.06264 Mean :0.1811 Mean :0.17444
3rd Qu.:0.2462 3rd Qu.:0.12251 3rd Qu.:0.07170 3rd Qu.:0.2187 3rd Qu.:0.22680
Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
concavity_se points_se symmetry_se dimension_se radius_worst
Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
1st Qu.:0.03811 1st Qu.:0.1447 1st Qu.:0.1024 1st Qu.:0.04675 1st Qu.:0.1807
Median :0.06538 Median :0.2070 Median :0.1526 Median :0.07919 Median :0.2504
Mean :0.08054 Mean :0.2235 Mean :0.1781 Mean :0.10019 Mean :0.2967
3rd Qu.:0.10619 3rd Qu.:0.2787 3rd Qu.:0.2195 3rd Qu.:0.12656 3rd Qu.:0.3863
Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
texture_worst perimeter_worst area_worst smoothness_worst compactness_worst
Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
1st Qu.:0.2415 1st Qu.:0.1678 1st Qu.:0.08113 1st Qu.:0.3000 1st Qu.:0.1163
Median :0.3569 Median :0.2353 Median :0.12321 Median :0.3971 Median :0.1791
Mean :0.3640 Mean :0.2831 Mean :0.17091 Mean :0.4041 Mean :0.2202
3rd Qu.:0.4717 3rd Qu.:0.3735 3rd Qu.:0.22090 3rd Qu.:0.4942 3rd Qu.:0.3025
Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
concavity_worst points_worst symmetry_worst dimension_worst
Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.09145 1st Qu.:0.2231 1st Qu.:0.1851 1st Qu.:0.1077
Median :0.18107 Median :0.3434 Median :0.2478 Median :0.1640
Mean :0.21740 Mean :0.3938 Mean :0.2633 Mean :0.1896
3rd Qu.:0.30583 3rd Qu.:0.5546 3rd Qu.:0.3182 3rd Qu.:0.2429
Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
Splitting Data into Training and Testing Sets
# Split dataset into training (80%) and test (20%) sets
set.seed(123) # Ensures reproducibility
train_index <- sample(1:nrow(wbcd_norm), 0.8 * nrow(wbcd_norm))
wbcd_train <- wbcd_norm[train_index, ]
wbcd_test <- wbcd_norm[-train_index, ]
# Extract corresponding labels for training and testing
wbcd_train_labels <- wbcd$diagnosis[train_index]
wbcd_test_labels <- wbcd$diagnosis[-train_index]
Implementing k-NN and Finding the Ideal k
library(class)
library(caret)
Loading required package: ggplot2
Registered S3 method overwritten by 'data.table':
method from
print.data.table
# Function to find the optimal k value based on accuracy
tune_k <- function(k_values) {
accuracies <- c()
for (k in k_values) {
pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = k)
acc <- sum(pred == wbcd_test_labels) / length(wbcd_test_labels)
accuracies <- c(accuracies, acc)
}
return(data.frame(k = k_values, accuracy = accuracies))
}
# Test multiple k values to find the best
k_values <- seq(1, 20, by = 2) # Testing odd values from 1 to 20
results <- tune_k(k_values)
# Find the best k value
best_k <- results$k[which.max(results$accuracy)]
print(paste("Optimal k: ", best_k))
[1] "Optimal k: 11"
# Plot k values vs accuracy
library(ggplot2)
ggplot(results, aes(x = k, y = accuracy)) +
geom_line() +
geom_point() +
labs(title = "k-NN Accuracy vs. k Value", x = "Number of Neighbors (k)", y = "Accuracy")

Running the Final k-NN Model with Optimal k
# Train final model with the best k value
wbcd_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = best_k)
# Evaluate performance using confusion matrix
conf_matrix <- table(Predicted = wbcd_pred, Actual = wbcd_test_labels)
print(conf_matrix)
Actual
Predicted Benign Malignant
Benign 73 1
Malignant 1 39
# Calculate overall accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Final Model Accuracy: ", round(accuracy * 100, 2), "%"))
[1] "Final Model Accuracy: 98.25 %"
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMjIyMgQ2hhcHRlciAzOiBDbGFzc2lmaWNhdGlvbiB1c2luZyBOZWFyZXN0IE5laWdoYm9ycwoKIyMgRXhhbXBsZTogQ2xhc3NpZnlpbmcgQ2FuY2VyIFNhbXBsZXMKIyMgU3RlcCAyOiBFeHBsb3JpbmcgYW5kIHByZXBhcmluZyB0aGUgZGF0YQoKYGBge3J9CiMgSW1wb3J0IHRoZSBDU1YgZmlsZQp3YmNkIDwtIHJlYWQuY3N2KCJ3aXNjX2JjX2RhdGEuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFKQoKIyBFeGFtaW5lIHRoZSBzdHJ1Y3R1cmUgb2YgdGhlIGRhdGFzZXQKc3RyKHdiY2QpCmBgYAoKYGBge3J9CiMgRHJvcCB0aGUgJ2lkJyBmZWF0dXJlIGFzIGl0IGlzIG5vdCByZWxldmFudCBmb3IgY2xhc3NpZmljYXRpb24Kd2JjZCA8LSB3YmNkWy0xXQpgYGAKCmBgYHtyfQojIERpc3BsYXkgZnJlcXVlbmN5IHRhYmxlIG9mIHRoZSBkaWFnbm9zaXMgdmFyaWFibGUKdGFibGUod2JjZCRkaWFnbm9zaXMpCmBgYAoKYGBge3J9CiMgQ29udmVydCBkaWFnbm9zaXMgdmFyaWFibGUgdG8gYSBmYWN0b3Igd2l0aCBtZWFuaW5nZnVsIGxhYmVscwp3YmNkJGRpYWdub3NpcyA8LSBmYWN0b3Iod2JjZCRkaWFnbm9zaXMsIGxldmVscyA9IGMoIkIiLCAiTSIpLAogICAgICAgICAgICAgICAgICAgICAgICAgbGFiZWxzID0gYygiQmVuaWduIiwgIk1hbGlnbmFudCIpKQpgYGAKCmBgYHtyfQojIENhbGN1bGF0ZSBhbmQgZGlzcGxheSBwcm9wb3J0aW9ucyBvZiBlYWNoIGRpYWdub3NpcyBjbGFzcwpyb3VuZChwcm9wLnRhYmxlKHRhYmxlKHdiY2QkZGlhZ25vc2lzKSkgKiAxMDAsIGRpZ2l0cyA9IDEpCmBgYAoKYGBge3J9CiMgU3VtbWFyaXplIHRocmVlIHNlbGVjdGVkIG51bWVyaWMgZmVhdHVyZXMKc3VtbWFyeSh3YmNkW2MoInJhZGl1c19tZWFuIiwgImFyZWFfbWVhbiIsICJzbW9vdGhuZXNzX21lYW4iKV0pCmBgYAoKYGBge3J9CiMgRGVmaW5lIGEgbm9ybWFsaXphdGlvbiBmdW5jdGlvbiB0byBzY2FsZSBudW1lcmljIHZhbHVlcyBiZXR3ZWVuIDAgYW5kIDEKbm9ybWFsaXplIDwtIGZ1bmN0aW9uKHgpIHsKICByZXR1cm4gKCh4IC0gbWluKHgpKSAvIChtYXgoeCkgLSBtaW4oeCkpKQp9CmBgYAoKYGBge3J9CiMgQXBwbHkgbm9ybWFsaXphdGlvbiB0byBhbGwgbnVtZXJpYyBmZWF0dXJlcyAoZXhjbHVkaW5nIHRoZSBkaWFnbm9zaXMgbGFiZWwpCndiY2Rfbm9ybSA8LSBhcy5kYXRhLmZyYW1lKGxhcHBseSh3YmNkWzI6MzFdLCBub3JtYWxpemUpKQoKIyBEaXNwbGF5IHN1bW1hcnkgb2YgdGhlIG5vcm1hbGl6ZWQgZGF0YQpzdW1tYXJ5KHdiY2Rfbm9ybSkKYGBgCgojIyBTcGxpdHRpbmcgRGF0YSBpbnRvIFRyYWluaW5nIGFuZCBUZXN0aW5nIFNldHMKYGBge3J9CiMgU3BsaXQgZGF0YXNldCBpbnRvIHRyYWluaW5nICg4MCUpIGFuZCB0ZXN0ICgyMCUpIHNldHMKc2V0LnNlZWQoMTIzKSAjIEVuc3VyZXMgcmVwcm9kdWNpYmlsaXR5CnRyYWluX2luZGV4IDwtIHNhbXBsZSgxOm5yb3cod2JjZF9ub3JtKSwgMC44ICogbnJvdyh3YmNkX25vcm0pKQp3YmNkX3RyYWluIDwtIHdiY2Rfbm9ybVt0cmFpbl9pbmRleCwgXQp3YmNkX3Rlc3QgPC0gd2JjZF9ub3JtWy10cmFpbl9pbmRleCwgXQoKIyBFeHRyYWN0IGNvcnJlc3BvbmRpbmcgbGFiZWxzIGZvciB0cmFpbmluZyBhbmQgdGVzdGluZwp3YmNkX3RyYWluX2xhYmVscyA8LSB3YmNkJGRpYWdub3Npc1t0cmFpbl9pbmRleF0Kd2JjZF90ZXN0X2xhYmVscyA8LSB3YmNkJGRpYWdub3Npc1stdHJhaW5faW5kZXhdCmBgYAoKIyMgSW1wbGVtZW50aW5nIGstTk4gYW5kIEZpbmRpbmcgdGhlIElkZWFsIGsKYGBge3J9CmxpYnJhcnkoY2xhc3MpCmxpYnJhcnkoY2FyZXQpCgojIEZ1bmN0aW9uIHRvIGZpbmQgdGhlIG9wdGltYWwgayB2YWx1ZSBiYXNlZCBvbiBhY2N1cmFjeQp0dW5lX2sgPC0gZnVuY3Rpb24oa192YWx1ZXMpIHsKICBhY2N1cmFjaWVzIDwtIGMoKQogIAogIGZvciAoayBpbiBrX3ZhbHVlcykgewogICAgcHJlZCA8LSBrbm4odHJhaW4gPSB3YmNkX3RyYWluLCB0ZXN0ID0gd2JjZF90ZXN0LCBjbCA9IHdiY2RfdHJhaW5fbGFiZWxzLCBrID0gaykKICAgIGFjYyA8LSBzdW0ocHJlZCA9PSB3YmNkX3Rlc3RfbGFiZWxzKSAvIGxlbmd0aCh3YmNkX3Rlc3RfbGFiZWxzKQogICAgYWNjdXJhY2llcyA8LSBjKGFjY3VyYWNpZXMsIGFjYykKICB9CiAgCiAgcmV0dXJuKGRhdGEuZnJhbWUoayA9IGtfdmFsdWVzLCBhY2N1cmFjeSA9IGFjY3VyYWNpZXMpKQp9CgojIFRlc3QgbXVsdGlwbGUgayB2YWx1ZXMgdG8gZmluZCB0aGUgYmVzdAprX3ZhbHVlcyA8LSBzZXEoMSwgMjAsIGJ5ID0gMikgIyBUZXN0aW5nIG9kZCB2YWx1ZXMgZnJvbSAxIHRvIDIwCnJlc3VsdHMgPC0gdHVuZV9rKGtfdmFsdWVzKQoKIyBGaW5kIHRoZSBiZXN0IGsgdmFsdWUKYmVzdF9rIDwtIHJlc3VsdHMka1t3aGljaC5tYXgocmVzdWx0cyRhY2N1cmFjeSldCnByaW50KHBhc3RlKCJPcHRpbWFsIGs6ICIsIGJlc3RfaykpCgojIFBsb3QgayB2YWx1ZXMgdnMgYWNjdXJhY3kKbGlicmFyeShnZ3Bsb3QyKQpnZ3Bsb3QocmVzdWx0cywgYWVzKHggPSBrLCB5ID0gYWNjdXJhY3kpKSArCiAgZ2VvbV9saW5lKCkgKwogIGdlb21fcG9pbnQoKSArCiAgbGFicyh0aXRsZSA9ICJrLU5OIEFjY3VyYWN5IHZzLiBrIFZhbHVlIiwgeCA9ICJOdW1iZXIgb2YgTmVpZ2hib3JzIChrKSIsIHkgPSAiQWNjdXJhY3kiKQpgYGAKCiMjIFJ1bm5pbmcgdGhlIEZpbmFsIGstTk4gTW9kZWwgd2l0aCBPcHRpbWFsIGsKYGBge3J9CiMgVHJhaW4gZmluYWwgbW9kZWwgd2l0aCB0aGUgYmVzdCBrIHZhbHVlCndiY2RfcHJlZCA8LSBrbm4odHJhaW4gPSB3YmNkX3RyYWluLCB0ZXN0ID0gd2JjZF90ZXN0LCBjbCA9IHdiY2RfdHJhaW5fbGFiZWxzLCBrID0gYmVzdF9rKQoKIyBFdmFsdWF0ZSBwZXJmb3JtYW5jZSB1c2luZyBjb25mdXNpb24gbWF0cml4CmNvbmZfbWF0cml4IDwtIHRhYmxlKFByZWRpY3RlZCA9IHdiY2RfcHJlZCwgQWN0dWFsID0gd2JjZF90ZXN0X2xhYmVscykKcHJpbnQoY29uZl9tYXRyaXgpCgojIENhbGN1bGF0ZSBvdmVyYWxsIGFjY3VyYWN5CmFjY3VyYWN5IDwtIHN1bShkaWFnKGNvbmZfbWF0cml4KSkgLyBzdW0oY29uZl9tYXRyaXgpCnByaW50KHBhc3RlKCJGaW5hbCBNb2RlbCBBY2N1cmFjeTogIiwgcm91bmQoYWNjdXJhY3kgKiAxMDAsIDIpLCAiJSIpKQpgYGA=