# Classification Algorithm
# Load the dataset from its CSV file into a data frame
wbcd <- read.csv(file = "E:\\EXCELR ASSIGMENTS\\wbcd.csv")
class(x = wbcd)
## [1] "data.frame"
View(wbcd)
# The first column of the dataset is the record id, which is not needed
# for classification, so it is dropped
wbcd <- wbcd[, -1]
View(wbcd)
# Count the records per diagnosis code
# (the recorded output below shows 357 "B" and 212 "M")
table(wbcd[["diagnosis"]])
##
## B M
## 357 212
# Recode diagnosis as a factor with two levels, relabelling "B" as
# "Benign" and "M" as "Malignant"
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
# Percentage of entries that are Benign and Malignant, rounded to one
# decimal place
round(100 * prop.table(table(wbcd$diagnosis)), digits = 1)
##
## Benign Malignant
## 62.7 37.3
# Summary statistics for three of the measurement columns
summary(wbcd[, c("radius_mean", "texture_mean", "perimeter_mean")])
## radius_mean texture_mean perimeter_mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79
## 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
#' Min-max normalize a numeric vector to the range [0, 1]
#'
#' Each value is rescaled as (x - min) / (max - min).
#'
#' NOTE: the name `norm` masks base::norm() (the matrix norm) for as long
#' as this definition is in scope.
#'
#' @param x A numeric vector.
#' @param na.rm If TRUE, missing values are ignored when computing the
#'   minimum and maximum (they stay NA in the result). The default, FALSE,
#'   keeps the original behaviour: any NA in `x` makes the whole result NA.
#' @return A numeric vector the same length as `x`. A vector whose values
#'   are all equal is mapped to all zeros instead of NaN (0 / 0), and empty
#'   input returns an empty numeric vector without warnings.
norm <- function(x, na.rm = FALSE) {
  # min()/max() of an empty vector only produce warnings and +/-Inf
  if (length(x) == 0L) {
    return(as.numeric(x))
  }
  bounds <- range(x, na.rm = na.rm)
  span <- bounds[2] - bounds[1]
  # A constant vector has zero span; dividing by 1 instead yields zeros
  # rather than NaN, which downstream distance computations cannot handle
  if (!is.na(span) && span == 0) {
    span <- 1
  }
  (x - bounds[1]) / span
}
# Sanity-check the normalization: evenly spaced inputs on different
# scales should give the same rescaled values
norm(1:5)
## [1] 0.00 0.25 0.50 0.75 1.00
norm(seq(10, 50, by = 10))
## [1] 0.00 0.25 0.50 0.75 1.00
# Rescale every feature column (columns 2 to 31 of wbcd) with norm() and
# collect the results in a new data frame
wbcd_n <- as.data.frame(lapply(X = wbcd[2:31], FUN = norm))
View(wbcd_n)
# Split the normalized features by row position:
# rows 1-469 for training, rows 470-569 for testing
# NOTE(review): positional split; assumes the rows are not ordered by
# outcome -- confirm
wbcd_train <- wbcd_n[seq_len(469), ]
wbcd_test <- wbcd_n[seq(from = 470, to = 569), ]
# Diagnosis labels (column 1 of wbcd) for the same two row ranges
wbcd_train_labels <- wbcd[[1]][1:469]
wbcd_test_labels <- wbcd[[1]][470:569]
# Build KNN models on the training dataset
library("class")
## Warning: package 'class' was built under R version 3.5.1
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
# knn() takes the training features, the test features and the training
# labels (argument `cl`), and returns the predicted class for each test row.
# knn() breaks voting ties at random, and with an even k and two classes
# ties can occur, so the seed is fixed before each call to make the
# predictions repeatable from run to run.
set.seed(123)
wbcd_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 20)
# Changing the value of k changes the error percentage
View(wbcd_pred)
class(wbcd_train)
## [1] "data.frame"
class(wbcd_test)
## [1] "data.frame"
# Second model with a larger neighbourhood (k = 50) for comparison
set.seed(123)
wbcd_pred1 <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 50)
View(wbcd_pred1)
# Evaluate the model performance
# The gmodels package provides CrossTable(); install it first if needed:
# install.packages("gmodels")
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1
# Cross-tabulate the actual test labels (rows) against the k = 20
# predictions (columns)
CrossTable(wbcd_test_labels, wbcd_pred)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | wbcd_pred
## wbcd_test_labels | Benign | Malignant | Row Total |
## -----------------|-----------|-----------|-----------|
## Benign | 61 | 0 | 61 |
## | 13.255 | 22.570 | |
## | 1.000 | 0.000 | 0.610 |
## | 0.968 | 0.000 | |
## | 0.610 | 0.000 | |
## -----------------|-----------|-----------|-----------|
## Malignant | 2 | 37 | 39 |
## | 20.733 | 35.302 | |
## | 0.051 | 0.949 | 0.390 |
## | 0.032 | 1.000 | |
## | 0.020 | 0.370 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 63 | 37 | 100 |
## | 0.630 | 0.370 | |
## -----------------|-----------|-----------|-----------|
##
##
# Cross-tabulate the actual test labels (rows) against the k = 50
# predictions (columns)
CrossTable(wbcd_test_labels, wbcd_pred1)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | wbcd_pred1
## wbcd_test_labels | Benign | Malignant | Row Total |
## -----------------|-----------|-----------|-----------|
## Benign | 61 | 0 | 61 |
## | 11.496 | 21.350 | |
## | 1.000 | 0.000 | 0.610 |
## | 0.938 | 0.000 | |
## | 0.610 | 0.000 | |
## -----------------|-----------|-----------|-----------|
## Malignant | 4 | 35 | 39 |
## | 17.981 | 33.394 | |
## | 0.103 | 0.897 | 0.390 |
## | 0.062 | 1.000 | |
## | 0.040 | 0.350 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 65 | 35 | 100 |
## | 0.650 | 0.350 | |
## -----------------|-----------|-----------|-----------|
##
##