Chapter 3: Classification using Nearest Neighbors

Example: Classifying Cancer Samples

Step 2: Exploring and preparing the data

# Import the CSV file
wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)

# Examine the structure of the dataset
str(wbcd)
'data.frame':   569 obs. of  32 variables:
 $ id               : int  87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
 $ diagnosis        : chr  "B" "B" "B" "B" ...
 $ radius_mean      : num  12.3 10.6 11 11.3 15.2 ...
 $ texture_mean     : num  12.4 18.9 16.8 13.4 13.2 ...
 $ perimeter_mean   : num  78.8 69.3 70.9 73 97.7 ...
 $ area_mean        : num  464 346 373 385 712 ...
 $ smoothness_mean  : num  0.1028 0.0969 0.1077 0.1164 0.0796 ...
 $ compactness_mean : num  0.0698 0.1147 0.078 0.1136 0.0693 ...
 $ concavity_mean   : num  0.0399 0.0639 0.0305 0.0464 0.0339 ...
 $ points_mean      : num  0.037 0.0264 0.0248 0.048 0.0266 ...
 $ symmetry_mean    : num  0.196 0.192 0.171 0.177 0.172 ...
 $ dimension_mean   : num  0.0595 0.0649 0.0634 0.0607 0.0554 ...
 $ radius_se        : num  0.236 0.451 0.197 0.338 0.178 ...
 $ texture_se       : num  0.666 1.197 1.387 1.343 0.412 ...
 $ perimeter_se     : num  1.67 3.43 1.34 1.85 1.34 ...
 $ area_se          : num  17.4 27.1 13.5 26.3 17.7 ...
 $ smoothness_se    : num  0.00805 0.00747 0.00516 0.01127 0.00501 ...
 $ compactness_se   : num  0.0118 0.03581 0.00936 0.03498 0.01485 ...
 $ concavity_se     : num  0.0168 0.0335 0.0106 0.0219 0.0155 ...
 $ points_se        : num  0.01241 0.01365 0.00748 0.01965 0.00915 ...
 $ symmetry_se      : num  0.0192 0.035 0.0172 0.0158 0.0165 ...
 $ dimension_se     : num  0.00225 0.00332 0.0022 0.00344 0.00177 ...
 $ radius_worst     : num  13.5 11.9 12.4 11.9 16.2 ...
 $ texture_worst    : num  15.6 22.9 26.4 15.8 15.7 ...
 $ perimeter_worst  : num  87 78.3 79.9 76.5 104.5 ...
 $ area_worst       : num  549 425 471 434 819 ...
 $ smoothness_worst : num  0.139 0.121 0.137 0.137 0.113 ...
 $ compactness_worst: num  0.127 0.252 0.148 0.182 0.174 ...
 $ concavity_worst  : num  0.1242 0.1916 0.1067 0.0867 0.1362 ...
 $ points_worst     : num  0.0939 0.0793 0.0743 0.0861 0.0818 ...
 $ symmetry_worst   : num  0.283 0.294 0.3 0.21 0.249 ...
 $ dimension_worst  : num  0.0677 0.0759 0.0788 0.0678 0.0677 ...
# Drop the 'id' feature as it is not relevant for classification
wbcd <- wbcd[-1]
# Display frequency table of the diagnosis variable
table(wbcd$diagnosis)

  B   M 
357 212 
# Convert diagnosis variable to a factor with meaningful labels
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))
# Calculate and display proportions of each diagnosis class
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)

   Benign Malignant 
     62.7      37.3 
# Summarize three selected numeric features
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])
  radius_mean       area_mean      smoothness_mean  
 Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
 1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
 Median :13.370   Median : 551.1   Median :0.09587  
 Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
 3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
 Max.   :28.110   Max.   :2501.0   Max.   :0.16340  
# Define a normalization function to scale numeric values between 0 and 1
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}
# Apply normalization to all numeric features (excluding the diagnosis label)
wbcd_norm <- as.data.frame(lapply(wbcd[2:31], normalize))

# Display summary of the normalized data
summary(wbcd_norm)
  radius_mean      texture_mean    perimeter_mean     area_mean      smoothness_mean  compactness_mean
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168   1st Qu.:0.1174   1st Qu.:0.3046   1st Qu.:0.1397  
 Median :0.3024   Median :0.3088   Median :0.2933   Median :0.1729   Median :0.3904   Median :0.2247  
 Mean   :0.3382   Mean   :0.3240   Mean   :0.3329   Mean   :0.2169   Mean   :0.3948   Mean   :0.2606  
 3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168   3rd Qu.:0.2711   3rd Qu.:0.4755   3rd Qu.:0.3405  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
 concavity_mean     points_mean     symmetry_mean    dimension_mean     radius_se      
 Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
 1st Qu.:0.06926   1st Qu.:0.1009   1st Qu.:0.2823   1st Qu.:0.1630   1st Qu.:0.04378  
 Median :0.14419   Median :0.1665   Median :0.3697   Median :0.2439   Median :0.07702  
 Mean   :0.20806   Mean   :0.2431   Mean   :0.3796   Mean   :0.2704   Mean   :0.10635  
 3rd Qu.:0.30623   3rd Qu.:0.3678   3rd Qu.:0.4530   3rd Qu.:0.3404   3rd Qu.:0.13304  
 Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
   texture_se      perimeter_se        area_se        smoothness_se    compactness_se   
 Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
 1st Qu.:0.1047   1st Qu.:0.04000   1st Qu.:0.02064   1st Qu.:0.1175   1st Qu.:0.08132  
 Median :0.1653   Median :0.07209   Median :0.03311   Median :0.1586   Median :0.13667  
 Mean   :0.1893   Mean   :0.09938   Mean   :0.06264   Mean   :0.1811   Mean   :0.17444  
 3rd Qu.:0.2462   3rd Qu.:0.12251   3rd Qu.:0.07170   3rd Qu.:0.2187   3rd Qu.:0.22680  
 Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
  concavity_se       points_se       symmetry_se      dimension_se      radius_worst   
 Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
 1st Qu.:0.03811   1st Qu.:0.1447   1st Qu.:0.1024   1st Qu.:0.04675   1st Qu.:0.1807  
 Median :0.06538   Median :0.2070   Median :0.1526   Median :0.07919   Median :0.2504  
 Mean   :0.08054   Mean   :0.2235   Mean   :0.1781   Mean   :0.10019   Mean   :0.2967  
 3rd Qu.:0.10619   3rd Qu.:0.2787   3rd Qu.:0.2195   3rd Qu.:0.12656   3rd Qu.:0.3863  
 Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
 texture_worst    perimeter_worst    area_worst      smoothness_worst compactness_worst
 Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   
 1st Qu.:0.2415   1st Qu.:0.1678   1st Qu.:0.08113   1st Qu.:0.3000   1st Qu.:0.1163   
 Median :0.3569   Median :0.2353   Median :0.12321   Median :0.3971   Median :0.1791   
 Mean   :0.3640   Mean   :0.2831   Mean   :0.17091   Mean   :0.4041   Mean   :0.2202   
 3rd Qu.:0.4717   3rd Qu.:0.3735   3rd Qu.:0.22090   3rd Qu.:0.4942   3rd Qu.:0.3025   
 Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   
 concavity_worst    points_worst    symmetry_worst   dimension_worst 
 Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.09145   1st Qu.:0.2231   1st Qu.:0.1851   1st Qu.:0.1077  
 Median :0.18107   Median :0.3434   Median :0.2478   Median :0.1640  
 Mean   :0.21740   Mean   :0.3938   Mean   :0.2633   Mean   :0.1896  
 3rd Qu.:0.30583   3rd Qu.:0.5546   3rd Qu.:0.3182   3rd Qu.:0.2429  
 Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  

Splitting Data into Training and Testing Sets

# Split dataset into training (80%) and test (20%) sets
set.seed(123) # Ensures reproducibility
train_index <- sample(1:nrow(wbcd_norm), 0.8 * nrow(wbcd_norm))
wbcd_train <- wbcd_norm[train_index, ]
wbcd_test <- wbcd_norm[-train_index, ]

# Extract corresponding labels for training and testing
wbcd_train_labels <- wbcd$diagnosis[train_index]
wbcd_test_labels <- wbcd$diagnosis[-train_index]

Implementing k-NN and Finding the Ideal k

library(class)
library(caret)
Loading required package: ggplot2
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
# Function to find the optimal k value based on accuracy
tune_k <- function(k_values) {
  accuracies <- c()
  
  for (k in k_values) {
    pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = k)
    acc <- sum(pred == wbcd_test_labels) / length(wbcd_test_labels)
    accuracies <- c(accuracies, acc)
  }
  
  return(data.frame(k = k_values, accuracy = accuracies))
}

# Test multiple k values to find the best
k_values <- seq(1, 20, by = 2) # Testing odd values from 1 to 20
results <- tune_k(k_values)

# Find the best k value
best_k <- results$k[which.max(results$accuracy)]
print(paste("Optimal k: ", best_k))
[1] "Optimal k:  11"
# Plot k values vs accuracy
library(ggplot2)
ggplot(results, aes(x = k, y = accuracy)) +
  geom_line() +
  geom_point() +
  labs(title = "k-NN Accuracy vs. k Value", x = "Number of Neighbors (k)", y = "Accuracy")

Running the Final k-NN Model with Optimal k

# Train final model with the best k value
wbcd_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = best_k)

# Evaluate performance using confusion matrix
conf_matrix <- table(Predicted = wbcd_pred, Actual = wbcd_test_labels)
print(conf_matrix)
           Actual
Predicted   Benign Malignant
  Benign        73         1
  Malignant      1        39
# Calculate overall accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Final Model Accuracy: ", round(accuracy * 100, 2), "%"))
[1] "Final Model Accuracy:  98.25 %"
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMjIyMgQ2hhcHRlciAzOiBDbGFzc2lmaWNhdGlvbiB1c2luZyBOZWFyZXN0IE5laWdoYm9ycwoKIyMgRXhhbXBsZTogQ2xhc3NpZnlpbmcgQ2FuY2VyIFNhbXBsZXMKIyMgU3RlcCAyOiBFeHBsb3JpbmcgYW5kIHByZXBhcmluZyB0aGUgZGF0YQoKYGBge3J9CiMgSW1wb3J0IHRoZSBDU1YgZmlsZQp3YmNkIDwtIHJlYWQuY3N2KCJ3aXNjX2JjX2RhdGEuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFKQoKIyBFeGFtaW5lIHRoZSBzdHJ1Y3R1cmUgb2YgdGhlIGRhdGFzZXQKc3RyKHdiY2QpCmBgYAoKYGBge3J9CiMgRHJvcCB0aGUgJ2lkJyBmZWF0dXJlIGFzIGl0IGlzIG5vdCByZWxldmFudCBmb3IgY2xhc3NpZmljYXRpb24Kd2JjZCA8LSB3YmNkWy0xXQpgYGAKCmBgYHtyfQojIERpc3BsYXkgZnJlcXVlbmN5IHRhYmxlIG9mIHRoZSBkaWFnbm9zaXMgdmFyaWFibGUKdGFibGUod2JjZCRkaWFnbm9zaXMpCmBgYAoKYGBge3J9CiMgQ29udmVydCBkaWFnbm9zaXMgdmFyaWFibGUgdG8gYSBmYWN0b3Igd2l0aCBtZWFuaW5nZnVsIGxhYmVscwp3YmNkJGRpYWdub3NpcyA8LSBmYWN0b3Iod2JjZCRkaWFnbm9zaXMsIGxldmVscyA9IGMoIkIiLCAiTSIpLAogICAgICAgICAgICAgICAgICAgICAgICAgbGFiZWxzID0gYygiQmVuaWduIiwgIk1hbGlnbmFudCIpKQpgYGAKCmBgYHtyfQojIENhbGN1bGF0ZSBhbmQgZGlzcGxheSBwcm9wb3J0aW9ucyBvZiBlYWNoIGRpYWdub3NpcyBjbGFzcwpyb3VuZChwcm9wLnRhYmxlKHRhYmxlKHdiY2QkZGlhZ25vc2lzKSkgKiAxMDAsIGRpZ2l0cyA9IDEpCmBgYAoKYGBge3J9CiMgU3VtbWFyaXplIHRocmVlIHNlbGVjdGVkIG51bWVyaWMgZmVhdHVyZXMKc3VtbWFyeSh3YmNkW2MoInJhZGl1c19tZWFuIiwgImFyZWFfbWVhbiIsICJzbW9vdGhuZXNzX21lYW4iKV0pCmBgYAoKYGBge3J9CiMgRGVmaW5lIGEgbm9ybWFsaXphdGlvbiBmdW5jdGlvbiB0byBzY2FsZSBudW1lcmljIHZhbHVlcyBiZXR3ZWVuIDAgYW5kIDEKbm9ybWFsaXplIDwtIGZ1bmN0aW9uKHgpIHsKICByZXR1cm4gKCh4IC0gbWluKHgpKSAvIChtYXgoeCkgLSBtaW4oeCkpKQp9CmBgYAoKYGBge3J9CiMgQXBwbHkgbm9ybWFsaXphdGlvbiB0byBhbGwgbnVtZXJpYyBmZWF0dXJlcyAoZXhjbHVkaW5nIHRoZSBkaWFnbm9zaXMgbGFiZWwpCndiY2Rfbm9ybSA8LSBhcy5kYXRhLmZyYW1lKGxhcHBseSh3YmNkWzI6MzFdLCBub3JtYWxpemUpKQoKIyBEaXNwbGF5IHN1bW1hcnkgb2YgdGhlIG5vcm1hbGl6ZWQgZGF0YQpzdW1tYXJ5KHdiY2Rfbm9ybSkKYGBgCgojIyBTcGxpdHRpbmcgRGF0YSBpbnRvIFRyYWluaW5nIGFuZCBUZXN0aW5nIFNldHMKYGBge3J9CiMgU3BsaXQgZGF0YXNldCBpbnRvIHRyYWluaW5nICg4MCUpIGFuZCB0ZXN0ICgyMCUpIHNldHMKc2V0LnNlZWQoMTIzKSAjIEVuc3VyZXMgcmVwcm9kdWNpYmlsaXR5CnRyYWluX2luZGV4IDwtIHNhbXBsZSgxOm5yb3cod2JjZF9ub3JtKSwgMC44ICogbnJvdyh3YmNkX25vcm0pKQp3YmNkX3RyYWluIDwtIHdiY2Rfbm9ybVt0cmFpbl9pbmRleCwgXQp3YmNkX3Rlc3QgPC0gd2JjZF9ub3JtWy10cmFpbl9pbmRleCwgXQoKIyBFeHRyYWN0IGNvcnJlc3BvbmRpbmcgbGFiZWxzIGZvciB0cmFpbmluZyBhbmQgdGVzdGluZwp3YmNkX3RyYWluX2xhYmVscyA8LSB3YmNkJGRpYWdub3Npc1t0cmFpbl9pbmRleF0Kd2JjZF90ZXN0X2xhYmVscyA8LSB3YmNkJGRpYWdub3Npc1stdHJhaW5faW5kZXhdCmBgYAoKIyMgSW1wbGVtZW50aW5nIGstTk4gYW5kIEZpbmRpbmcgdGhlIElkZWFsIGsKYGBge3J9CmxpYnJhcnkoY2xhc3MpCmxpYnJhcnkoY2FyZXQpCgojIEZ1bmN0aW9uIHRvIGZpbmQgdGhlIG9wdGltYWwgayB2YWx1ZSBiYXNlZCBvbiBhY2N1cmFjeQp0dW5lX2sgPC0gZnVuY3Rpb24oa192YWx1ZXMpIHsKICBhY2N1cmFjaWVzIDwtIGMoKQogIAogIGZvciAoayBpbiBrX3ZhbHVlcykgewogICAgcHJlZCA8LSBrbm4odHJhaW4gPSB3YmNkX3RyYWluLCB0ZXN0ID0gd2JjZF90ZXN0LCBjbCA9IHdiY2RfdHJhaW5fbGFiZWxzLCBrID0gaykKICAgIGFjYyA8LSBzdW0ocHJlZCA9PSB3YmNkX3Rlc3RfbGFiZWxzKSAvIGxlbmd0aCh3YmNkX3Rlc3RfbGFiZWxzKQogICAgYWNjdXJhY2llcyA8LSBjKGFjY3VyYWNpZXMsIGFjYykKICB9CiAgCiAgcmV0dXJuKGRhdGEuZnJhbWUoayA9IGtfdmFsdWVzLCBhY2N1cmFjeSA9IGFjY3VyYWNpZXMpKQp9CgojIFRlc3QgbXVsdGlwbGUgayB2YWx1ZXMgdG8gZmluZCB0aGUgYmVzdAprX3ZhbHVlcyA8LSBzZXEoMSwgMjAsIGJ5ID0gMikgIyBUZXN0aW5nIG9kZCB2YWx1ZXMgZnJvbSAxIHRvIDIwCnJlc3VsdHMgPC0gdHVuZV9rKGtfdmFsdWVzKQoKIyBGaW5kIHRoZSBiZXN0IGsgdmFsdWUKYmVzdF9rIDwtIHJlc3VsdHMka1t3aGljaC5tYXgocmVzdWx0cyRhY2N1cmFjeSldCnByaW50KHBhc3RlKCJPcHRpbWFsIGs6ICIsIGJlc3RfaykpCgojIFBsb3QgayB2YWx1ZXMgdnMgYWNjdXJhY3kKbGlicmFyeShnZ3Bsb3QyKQpnZ3Bsb3QocmVzdWx0cywgYWVzKHggPSBrLCB5ID0gYWNjdXJhY3kpKSArCiAgZ2VvbV9saW5lKCkgKwogIGdlb21fcG9pbnQoKSArCiAgbGFicyh0aXRsZSA9ICJrLU5OIEFjY3VyYWN5IHZzLiBrIFZhbHVlIiwgeCA9ICJOdW1iZXIgb2YgTmVpZ2hib3JzIChrKSIsIHkgPSAiQWNjdXJhY3kiKQpgYGAKCiMjIFJ1bm5pbmcgdGhlIEZpbmFsIGstTk4gTW9kZWwgd2l0aCBPcHRpbWFsIGsKYGBge3J9CiMgVHJhaW4gZmluYWwgbW9kZWwgd2l0aCB0aGUgYmVzdCBrIHZhbHVlCndiY2RfcHJlZCA8LSBrbm4odHJhaW4gPSB3YmNkX3RyYWluLCB0ZXN0ID0gd2JjZF90ZXN0LCBjbCA9IHdiY2RfdHJhaW5fbGFiZWxzLCBrID0gYmVzdF9rKQoKIyBFdmFsdWF0ZSBwZXJmb3JtYW5jZSB1c2luZyBjb25mdXNpb24gbWF0cml4CmNvbmZfbWF0cml4IDwtIHRhYmxlKFByZWRpY3RlZCA9IHdiY2RfcHJlZCwgQWN0dWFsID0gd2JjZF90ZXN0X2xhYmVscykKcHJpbnQoY29uZl9tYXRyaXgpCgojIENhbGN1bGF0ZSBvdmVyYWxsIGFjY3VyYWN5CmFjY3VyYWN5IDwtIHN1bShkaWFnKGNvbmZfbWF0cml4KSkgLyBzdW0oY29uZl9tYXRyaXgpCnByaW50KHBhc3RlKCJGaW5hbCBNb2RlbCBBY2N1cmFjeTogIiwgcm91bmQoYWNjdXJhY3kgKiAxMDAsIDIpLCAiJSIpKQpgYGA=