# Load the data set; text columns stay character rather than becoming factors
wbcd <- read.csv(file = "wisc_bc_data.csv", stringsAsFactors = FALSE)
# The first column is the id variable, which is not a predictor, so discard it
wbcd <- wbcd[, -1, drop = FALSE]
# Number of cases per diagnosis code
table(wbcd[["diagnosis"]])
## 
##   B   M 
## 357 212
# Recode the diagnosis codes as a factor with more descriptive level names
wbcd[["diagnosis"]] <- factor(
  wbcd[["diagnosis"]],
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
# Share of each diagnosis as a percentage, rounded to one decimal place
round(100 * prop.table(table(wbcd[["diagnosis"]])), digits = 1)
## 
##    Benign Malignant 
##      62.7      37.3
# Summaries of three features; their ranges differ by orders of magnitude
summary(wbcd[, c("radius_mean", "area_mean", "smoothness_mean")])
##   radius_mean       area_mean      smoothness_mean  
##  Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
##  1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
##  Median :13.370   Median : 551.1   Median :0.09587  
##  Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
##  3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
##  Max.   :28.110   Max.   :2501.0   Max.   :0.16340
# Min-max normalization: rescale a numeric vector onto the [0, 1] interval.
#
# Arguments:
#   x      A numeric (or logical) vector.
#   na.rm  If TRUE (the default), NA values are ignored when locating the
#          minimum and maximum; NA entries themselves stay NA in the result.
#          If FALSE, any NA in x makes every element of the result NA.
#
# Returns a double vector the same length as x. The smallest value maps to 0
# and the largest to 1. When all non-missing values are equal there is no
# spread to rescale, so those entries map to 0 instead of NaN (0 / 0).
# Zero-length and all-NA inputs come back unchanged in length, without the
# warnings min()/max() would otherwise raise.
normalize <- function(x, na.rm = TRUE) {
  stopifnot(is.numeric(x) || is.logical(x))

  # all() on a zero-length vector is TRUE, so this also covers empty input
  if (all(is.na(x))) {
    return(x * 1)
  }

  # as.numeric() keeps the arithmetic below in double precision
  limits <- as.numeric(range(x, na.rm = na.rm))
  span <- limits[2] - limits[1]
  shifted <- x - limits[1]

  # isTRUE() guards against an NA span (na.rm = FALSE with missing values)
  if (isTRUE(span == 0)) {
    return(shifted)
  }

  shifted / span
}

# Rescale each feature column (positions 2 through 31) with normalize()
wbcd_n <- as.data.frame(lapply(X = wbcd[, 2:31], FUN = normalize))

# Spot-check one rescaled feature
summary(wbcd_n[["area_mean"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1174  0.1729  0.2169  0.2711  1.0000
# Rows 1-469 train the classifier; rows 470-569 are held out for testing
wbcd_train <- wbcd_n[seq_len(469), ]
wbcd_test <- wbcd_n[seq(470, 569), ]

# The class labels are the first column of the un-normalized data frame
wbcd_train_labels <- wbcd[[1]][seq_len(469)]
wbcd_test_labels <- wbcd[[1]][seq(470, 569)]

library(class)
# Classify each test row by a vote among its 21 nearest training rows
wbcd_test_pred <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 21
)

library(gmodels)
# Cross-tabulate actual against predicted labels, without chi-square cells
CrossTable(
  x = wbcd_test_labels,
  y = wbcd_test_pred,
  prop.chisq = FALSE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                  | wbcd_test_pred 
## wbcd_test_labels |    Benign | Malignant | Row Total | 
## -----------------|-----------|-----------|-----------|
##           Benign |        61 |         0 |        61 | 
##                  |     1.000 |     0.000 |     0.610 | 
##                  |     0.968 |     0.000 |           | 
##                  |     0.610 |     0.000 |           | 
## -----------------|-----------|-----------|-----------|
##        Malignant |         2 |        37 |        39 | 
##                  |     0.051 |     0.949 |     0.390 | 
##                  |     0.032 |     1.000 |           | 
##                  |     0.020 |     0.370 |           | 
## -----------------|-----------|-----------|-----------|
##     Column Total |        63 |        37 |       100 | 
##                  |     0.630 |     0.370 |           | 
## -----------------|-----------|-----------|-----------|
## 
## 
# Alternative rescaling: centre and scale every column except the first
wbcd_z <- as.data.frame(
  scale(wbcd[, -1, drop = FALSE], center = TRUE, scale = TRUE)
)

# Spot-check one standardized feature
summary(wbcd_z[["area_mean"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.4532 -0.6666 -0.2949  0.0000  0.3632  5.2459
# Same row split as the min-max run, now taken from the standardized features
wbcd_train <- wbcd_z[seq_len(469), ]
wbcd_test <- wbcd_z[seq(470, 569), ]
wbcd_train_labels <- wbcd[[1]][seq_len(469)]
wbcd_test_labels <- wbcd[[1]][seq(470, 569)]

# Refit k-NN with the same k = 21 and cross-tabulate the predictions
wbcd_test_pred <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 21
)
CrossTable(
  x = wbcd_test_labels,
  y = wbcd_test_pred,
  prop.chisq = FALSE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                  | wbcd_test_pred 
## wbcd_test_labels |    Benign | Malignant | Row Total | 
## -----------------|-----------|-----------|-----------|
##           Benign |        61 |         0 |        61 | 
##                  |     1.000 |     0.000 |     0.610 | 
##                  |     0.924 |     0.000 |           | 
##                  |     0.610 |     0.000 |           | 
## -----------------|-----------|-----------|-----------|
##        Malignant |         5 |        34 |        39 | 
##                  |     0.128 |     0.872 |     0.390 | 
##                  |     0.076 |     1.000 |           | 
##                  |     0.050 |     0.340 |           | 
## -----------------|-----------|-----------|-----------|
##     Column Total |        66 |        34 |       100 | 
##                  |     0.660 |     0.340 |           | 
## -----------------|-----------|-----------|-----------|
## 
##