# Read the dataset
wbcd <- read.csv(
  file = "D:\\DataScience\\DATA\\ExcelR\\KNN\\KNN Dataset\\wbcd.csv"
)
class(wbcd)
## [1] "data.frame"
View(wbcd)
# The first column of the dataset is the id, which is not needed as a feature,
# so it is dropped here (by position).
wbcd <- wbcd[, -1, drop = FALSE]
View(wbcd)
# Count the records in each diagnosis class (B = 357, M = 212)
table(wbcd[["diagnosis"]])
##
## B M
## 357 212
# Recode diagnosis as a two-level factor, relabelling "B" as "Benign" and
# "M" as "Malignant".
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
# Share of each diagnosis class in the dataset, as a percentage rounded to
# one decimal place.
round(100 * prop.table(table(wbcd$diagnosis)), digits = 1)
##
## Benign Malignant
## 62.7 37.3
# Summary statistics for three of the features; their ranges differ widely,
# which is why the features are rescaled before running KNN.
summary(wbcd[, c("radius_mean", "texture_mean", "perimeter_mean")])
## radius_mean texture_mean perimeter_mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79
## 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
# Min-max normalize a vector so that its values lie in the range [0, 1].
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, NA entries are ignored when computing the minimum and
#          maximum and stay NA in the result. If FALSE (the default), any NA
#          in x makes the whole result NA.
#
# Returns a numeric vector the same length as x. A constant vector maps to
# all zeros instead of NaN (0 / 0), and a zero-length vector returns
# numeric(0) without the min()/max() warnings.
#
# NOTE: this name masks base::norm() (the matrix norm) for the rest of the
# session; it is kept because the rest of the script calls it by this name.
norm <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  bounds <- range(x, na.rm = na.rm)
  lo <- bounds[1]
  span <- bounds[2] - lo
  # A constant column has zero span; dividing by 1 instead yields 0 for every
  # element rather than NaN, which downstream distance code cannot handle.
  if (!is.na(span) && span == 0) {
    span <- 1
  }
  (x - lo) / span
}
# Apply the normalization function to columns 2-31 of wbcd (column 1 is used
# as the class label further down) and collect the results in a data frame.
wbcd_n <- as.data.frame(lapply(wbcd[2:31], norm))
View(wbcd_n)
# Create training and test datasets: the first n_train rows are used for
# training and every remaining row for testing. The row indices are computed
# once so that features and labels can never be split differently, and the
# test range follows the actual number of rows instead of a hard-coded 569.
n_train <- 469L
if (nrow(wbcd_n) <= n_train) {
  stop(
    "Need more than ", n_train, " rows to build a test set, got ",
    nrow(wbcd_n),
    call. = FALSE
  )
}
train_rows <- seq_len(n_train)
test_rows <- seq(n_train + 1L, nrow(wbcd_n))
wbcd_train <- wbcd_n[train_rows, ]
wbcd_test <- wbcd_n[test_rows, ]
# Get the labels (column 1 of wbcd) for the same training and test rows
wbcd_train_labels <- wbcd[train_rows, 1]
wbcd_test_labels <- wbcd[test_rows, 1]
# Build a KNN model on the training dataset
library("class")
library("caret")
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.5.1
## Loading required package: ggplot2
# Classify every test row with KNN: the training features and their labels
# (passed as `cl`) are the reference set, and k = 18 neighbours vote on each
# prediction. The result holds one predicted label per test row.
# NOTE(review): k is even, so a tied vote between the two classes is possible;
# confirm how knn() resolves ties or consider an odd k.
wbcd_pred <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 18
)
class(wbcd_train)
## [1] "data.frame"
class(wbcd_test)
## [1] "data.frame"
# Hold the predictions in a one-column data frame and print them
x <- data.frame(wbcd_pred = wbcd_pred)
x
## wbcd_pred
## 1 Benign
## 2 Benign
## 3 Benign
## 4 Benign
## 5 Malignant
## 6 Benign
## 7 Malignant
## 8 Benign
## 9 Malignant
## 10 Benign
## 11 Malignant
## 12 Benign
## 13 Malignant
## 14 Malignant
## 15 Benign
## 16 Benign
## 17 Malignant
## 18 Benign
## 19 Malignant
## 20 Benign
## 21 Malignant
## 22 Malignant
## 23 Malignant
## 24 Malignant
## 25 Benign
## 26 Benign
## 27 Benign
## 28 Benign
## 29 Malignant
## 30 Malignant
## 31 Malignant
## 32 Benign
## 33 Benign
## 34 Malignant
## 35 Benign
## 36 Benign
## 37 Benign
## 38 Benign
## 39 Benign
## 40 Malignant
## 41 Malignant
## 42 Benign
## 43 Malignant
## 44 Malignant
## 45 Benign
## 46 Malignant
## 47 Malignant
## 48 Malignant
## 49 Malignant
## 50 Malignant
## 51 Benign
## 52 Benign
## 53 Benign
## 54 Benign
## 55 Benign
## 56 Benign
## 57 Benign
## 58 Benign
## 59 Malignant
## 60 Benign
## 61 Benign
## 62 Benign
## 63 Benign
## 64 Benign
## 65 Malignant
## 66 Malignant
## 67 Benign
## 68 Benign
## 69 Benign
## 70 Benign
## 71 Benign
## 72 Malignant
## 73 Benign
## 74 Benign
## 75 Malignant
## 76 Malignant
## 77 Benign
## 78 Benign
## 79 Benign
## 80 Benign
## 81 Benign
## 82 Benign
## 83 Benign
## 84 Malignant
## 85 Benign
## 86 Benign
## 87 Malignant
## 88 Benign
## 89 Benign
## 90 Benign
## 91 Benign
## 92 Malignant
## 93 Benign
## 94 Benign
## 95 Benign
## 96 Benign
## 97 Benign
## 98 Malignant
## 99 Benign
## 100 Malignant
# Put the actual test labels next to the predictions for side-by-side review
x <- cbind(x, wbcd_test_labels = wbcd_test_labels)
## Now evaluate the model performance
#install.packages("gmodels")
library("gmodels")
## Warning: package 'gmodels' was built under R version 3.5.1
# Cross-tabulate the actual labels (rows) against the predicted labels
# (columns).
CrossTable(
  x = wbcd_test_labels,
  y = wbcd_pred
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | wbcd_pred
## wbcd_test_labels | Benign | Malignant | Row Total |
## -----------------|-----------|-----------|-----------|
## Benign | 61 | 0 | 61 |
## | 12.353 | 21.960 | |
## | 1.000 | 0.000 | 0.610 |
## | 0.953 | 0.000 | |
## | 0.610 | 0.000 | |
## -----------------|-----------|-----------|-----------|
## Malignant | 3 | 36 | 39 |
## | 19.321 | 34.348 | |
## | 0.077 | 0.923 | 0.390 |
## | 0.047 | 1.000 | |
## | 0.030 | 0.360 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 64 | 36 | 100 |
## | 0.640 | 0.360 | |
## -----------------|-----------|-----------|-----------|
##
##