library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrgram)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
library(ggplot2)
library(tidyr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
library(sjPlot)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
df <-read.csv("ProstateCancer.csv",stringsAsFactors = F,
colClasses =c("numeric","character","numeric","numeric","numeric",
"numeric","numeric","numeric","numeric","numeric"))
head(df)
## id diagnosis_result radius texture perimeter area smoothness compactness
## 1 1 M 23 12 151 954 0.143 0.278
## 2 2 B 9 13 133 1326 0.143 0.079
## 3 3 M 21 27 130 1203 0.125 0.160
## 4 4 M 14 16 78 386 0.070 0.284
## 5 5 M 9 19 135 1297 0.141 0.133
## 6 6 B 25 25 83 477 0.128 0.170
## symmetry fractal_dimension
## 1 0.242 0.079
## 2 0.181 0.057
## 3 0.207 0.060
## 4 0.260 0.097
## 5 0.181 0.059
## 6 0.209 0.076
str(df)
## 'data.frame': 100 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ diagnosis_result : chr "M" "B" "M" "M" ...
## $ radius : num 23 9 21 14 9 25 16 15 19 25 ...
## $ texture : num 12 13 27 16 19 25 26 18 24 11 ...
## $ perimeter : num 151 133 130 78 135 83 120 90 88 84 ...
## $ area : num 954 1326 1203 386 1297 ...
## $ smoothness : num 0.143 0.143 0.125 0.07 0.141 0.128 0.095 0.119 0.127 0.119 ...
## $ compactness : num 0.278 0.079 0.16 0.284 0.133 0.17 0.109 0.165 0.193 0.24 ...
## $ symmetry : num 0.242 0.181 0.207 0.26 0.181 0.209 0.179 0.22 0.235 0.203 ...
## $ fractal_dimension: num 0.079 0.057 0.06 0.097 0.059 0.076 0.057 0.075 0.074 0.082 ...
summary(df)
## id diagnosis_result radius texture
## Min. : 1.00 Length:100 Min. : 9.00 Min. :11.00
## 1st Qu.: 25.75 Class :character 1st Qu.:12.00 1st Qu.:14.00
## Median : 50.50 Mode :character Median :17.00 Median :17.50
## Mean : 50.50 Mean :16.85 Mean :18.23
## 3rd Qu.: 75.25 3rd Qu.:21.00 3rd Qu.:22.25
## Max. :100.00 Max. :25.00 Max. :27.00
##
## perimeter area smoothness compactness
## Min. : 52.00 Min. : 202.0 Min. :0.0700 Min. :0.0380
## 1st Qu.: 82.50 1st Qu.: 476.8 1st Qu.:0.0935 1st Qu.:0.0790
## Median : 94.00 Median : 644.0 Median :0.1020 Median :0.1160
## Mean : 96.78 Mean : 702.9 Mean :0.1027 Mean :0.1246
## 3rd Qu.:114.25 3rd Qu.: 917.0 3rd Qu.:0.1120 3rd Qu.:0.1550
## Max. :172.00 Max. :1878.0 Max. :0.1430 Max. :0.3450
## NA's :3
## symmetry fractal_dimension
## Min. :0.1350 Min. :0.05300
## 1st Qu.:0.1720 1st Qu.:0.05900
## Median :0.1900 Median :0.06350
## Mean :0.1932 Mean :0.06482
## 3rd Qu.:0.2090 3rd Qu.:0.06900
## Max. :0.3040 Max. :0.09700
## NA's :2
detectNAs <- function(inp){
sum(is.na(inp))
}
detectSpaces <- function(inp) {
if (class(inp) != "character") {
return ("Non Character Column")
}
sum(trimws(inp)=="")
}
detectOutliers <- function(inp,na.rm = TRUE) {
if (class(inp) != "numeric") {
return ("Non Numeric Column")
}
i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=T)
i.max <- 1.5 * IQR(inp, na.rm=T)
otp <- inp
otp[inp < (i.qnt[1] - i.max)] <- NA
otp[inp > (i.qnt[2] + i.max)] <- NA
return(inp[is.na(otp)])
}
ReplaceOutliers <- function(inp,na.rm = TRUE) {
if (class(inp) != "numeric") {
return ("Non Numeric Column")
}
i.qnt <- quantile(inp, probs = c(.25,.75), na.rm = T)
i.range <- 1.5 * IQR(inp, na.rm = T)
inp[inp < (i.qnt[1] - i.range)] <- NA
inp[inp > (i.qnt[2] + i.range)] <- NA
return(inp)
}
lapply(df,detectSpaces)
## $id
## [1] "Non Character Column"
##
## $diagnosis_result
## [1] 0
##
## $radius
## [1] "Non Character Column"
##
## $texture
## [1] "Non Character Column"
##
## $perimeter
## [1] "Non Character Column"
##
## $area
## [1] "Non Character Column"
##
## $smoothness
## [1] "Non Character Column"
##
## $compactness
## [1] "Non Character Column"
##
## $symmetry
## [1] "Non Character Column"
##
## $fractal_dimension
## [1] "Non Character Column"
lapply(df,detectNAs)
## $id
## [1] 0
##
## $diagnosis_result
## [1] 0
##
## $radius
## [1] 0
##
## $texture
## [1] 0
##
## $perimeter
## [1] 0
##
## $area
## [1] 0
##
## $smoothness
## [1] 0
##
## $compactness
## [1] 3
##
## $symmetry
## [1] 0
##
## $fractal_dimension
## [1] 2
lapply(df,detectOutliers)
## $id
## numeric(0)
##
## $diagnosis_result
## [1] "Non Numeric Column"
##
## $radius
## numeric(0)
##
## $texture
## numeric(0)
##
## $perimeter
## [1] 172
##
## $area
## [1] 1878
##
## $smoothness
## [1] 0.143 0.143 0.141
##
## $compactness
## [1] 0.278 0.284 NA NA NA 0.345
##
## $symmetry
## [1] 0.304 0.274 0.291
##
## $fractal_dimension
## [1] 0.097 NA NA 0.090
#lapply(df,ReplaceOutliers)
df$compactness[is.na(df$compactness)] <- mean(df$compactness,na.rm = T)
df$fractal_dimension[is.na(df$fractal_dimension)] <- mean(df$fractal_dimension, na.rm = T)
lapply(df, detectNAs)
## $id
## [1] 0
##
## $diagnosis_result
## [1] 0
##
## $radius
## [1] 0
##
## $texture
## [1] 0
##
## $perimeter
## [1] 0
##
## $area
## [1] 0
##
## $smoothness
## [1] 0
##
## $compactness
## [1] 0
##
## $symmetry
## [1] 0
##
## $fractal_dimension
## [1] 0
Exploratory Data Analsysi
df <- select(df,-id)
head(df)
## diagnosis_result radius texture perimeter area smoothness compactness
## 1 M 23 12 151 954 0.143 0.278
## 2 B 9 13 133 1326 0.143 0.079
## 3 M 21 27 130 1203 0.125 0.160
## 4 M 14 16 78 386 0.070 0.284
## 5 M 9 19 135 1297 0.141 0.133
## 6 B 25 25 83 477 0.128 0.170
## symmetry fractal_dimension
## 1 0.242 0.079
## 2 0.181 0.057
## 3 0.207 0.060
## 4 0.260 0.097
## 5 0.181 0.059
## 6 0.209 0.076
cor(df[2:9])
## radius texture perimeter area smoothness
## radius 1.00000000 0.10024503 -0.2382158 -0.2509343 -0.1271207
## texture 0.10024503 1.00000000 -0.1134528 -0.1137246 0.1023214
## perimeter -0.23821583 -0.11345280 1.0000000 0.9766481 0.2694421
## area -0.25093433 -0.11372456 0.9766481 1.0000000 0.2084380
## smoothness -0.12712075 0.10232140 0.2694421 0.2084380 1.0000000
## compactness -0.19343551 0.01365126 0.5009115 0.4007251 0.4700983
## symmetry -0.03970719 0.07791221 0.1955394 0.1104354 0.4242026
## fractal_dimension -0.02337681 0.13062365 -0.1752467 -0.2500199 0.3672369
## compactness symmetry fractal_dimension
## radius -0.19343551 -0.03970719 -0.02337681
## texture 0.01365126 0.07791221 0.13062365
## perimeter 0.50091154 0.19553938 -0.17524667
## area 0.40072506 0.11043543 -0.25001988
## smoothness 0.47009830 0.42420260 0.36723688
## compactness 1.00000000 0.60992055 0.61844856
## symmetry 0.60992055 1.00000000 0.55775260
## fractal_dimension 0.61844856 0.55775260 1.00000000
corrgram(df[2:9])
Visual data Analysis
ggplot.diagnosis <- ggplot(df, aes(x = diagnosis_result))+
geom_histogram(stat = 'count',aes(fill = diagnosis_result))+
labs(title = "Count of class",x = "diagnosis_result")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(ggplot.diagnosis)
ggplot <- ggplot(df, aes(x = diagnosis_result, y = perimeter))+
geom_boxplot()+
labs(title = "Result according to perimeter",x = "Diagnosis_result", y = "Perimeter")
ggplot
observation People having malignant prostate cancer has higher perimeter than benoign ones
wss function for obtaining optimum value of k
#for optimum value of K, we must give it predictor variables
dft <- select(df, -diagnosis_result)
sjc.elbow(dft)
observation # from graph we can see that k = 12 should be a good number for distance calculation # Let’s check with an KNN algorithm
split datasets
part <- createDataPartition(df$diagnosis_result,p = 0.8,list = FALSE)
dfTrn <- df[part,]
dfTst <- df[-part,]
summary(dfTrn)
## diagnosis_result radius texture perimeter
## Length:81 Min. : 9.00 Min. :11.00 Min. : 52.00
## Class :character 1st Qu.:12.00 1st Qu.:14.00 1st Qu.: 83.00
## Mode :character Median :18.00 Median :18.00 Median : 94.00
## Mean :17.17 Mean :18.33 Mean : 97.47
## 3rd Qu.:21.00 3rd Qu.:23.00 3rd Qu.:120.00
## Max. :25.00 Max. :27.00 Max. :172.00
## area smoothness compactness symmetry
## Min. : 202.0 Min. :0.070 Min. :0.047 Min. :0.135
## 1st Qu.: 477.0 1st Qu.:0.094 1st Qu.:0.082 1st Qu.:0.174
## Median : 584.0 Median :0.101 Median :0.122 Median :0.190
## Mean : 713.9 Mean :0.103 Mean :0.126 Mean :0.193
## 3rd Qu.: 955.0 3rd Qu.:0.112 3rd Qu.:0.154 3rd Qu.:0.208
## Max. :1878.0 Max. :0.143 Max. :0.345 Max. :0.304
## fractal_dimension
## Min. :0.05300
## 1st Qu.:0.05900
## Median :0.06400
## Mean :0.06509
## 3rd Qu.:0.06900
## Max. :0.09700
head(dfTrn)
## diagnosis_result radius texture perimeter area smoothness compactness
## 1 M 23 12 151 954 0.143 0.278
## 2 B 9 13 133 1326 0.143 0.079
## 3 M 21 27 130 1203 0.125 0.160
## 4 M 14 16 78 386 0.070 0.284
## 5 M 9 19 135 1297 0.141 0.133
## 6 B 25 25 83 477 0.128 0.170
## symmetry fractal_dimension
## 1 0.242 0.079
## 2 0.181 0.057
## 3 0.207 0.060
## 4 0.260 0.097
## 5 0.181 0.059
## 6 0.209 0.076
summary(dfTst)
## diagnosis_result radius texture perimeter
## Length:19 Min. :10.00 Min. :11.00 Min. : 60.00
## Class :character 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 81.50
## Mode :character Median :15.00 Median :16.00 Median :100.00
## Mean :15.47 Mean :17.79 Mean : 93.84
## 3rd Qu.:19.50 3rd Qu.:21.50 3rd Qu.:106.00
## Max. :22.00 Max. :27.00 Max. :120.00
## area smoothness compactness symmetry
## Min. : 269.0 Min. :0.0810 Min. :0.0380 Min. :0.1350
## 1st Qu.: 497.5 1st Qu.:0.0920 1st Qu.:0.0750 1st Qu.:0.1720
## Median : 704.0 Median :0.1040 Median :0.1040 Median :0.1900
## Mean : 656.1 Mean :0.1017 Mean :0.1185 Mean :0.1937
## 3rd Qu.: 791.0 3rd Qu.:0.1105 3rd Qu.:0.1525 3rd Qu.:0.2120
## Max. :1033.0 Max. :0.1200 Max. :0.2150 Max. :0.2740
## fractal_dimension
## Min. :0.05300
## 1st Qu.:0.05900
## Median :0.06400
## Mean :0.06363
## 3rd Qu.:0.06850
## Max. :0.07400
head(dfTst)
## diagnosis_result radius texture perimeter area smoothness compactness
## 14 M 12 22 104 783 0.084 0.100
## 17 M 10 16 95 685 0.099 0.072
## 18 M 15 14 108 799 0.117 0.202
## 23 M 20 27 103 704 0.107 0.214
## 29 M 15 15 102 732 0.108 0.170
## 33 M 20 18 113 899 0.120 0.150
## symmetry fractal_dimension
## 14 0.185 0.053
## 17 0.159 0.059
## 18 0.216 0.074
## 23 0.252 0.070
## 29 0.193 0.065
## 33 0.225 0.064
Build Model
set.seed(1)
knnCntrl <- trainControl(method="repeatedcv", number=10, repeats=10)
knnModel <- train(diagnosis_result~.,
data=dfTrn,
method="knn",
trControl=knnCntrl,
preProcess = c("center","scale"),
tuneLength=20)
print(knnModel)
## k-Nearest Neighbors
##
## 81 samples
## 8 predictor
## 2 classes: 'B', 'M'
##
## Pre-processing: centered (8), scaled (8)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 73, 73, 73, 72, 73, 73, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7895833 0.5435682
## 7 0.7825000 0.5253897
## 9 0.7911111 0.5361349
## 11 0.8150000 0.5835796
## 13 0.8063889 0.5672023
## 15 0.7851389 0.5222607
## 17 0.7926389 0.5380497
## 19 0.7926389 0.5404645
## 21 0.7988889 0.5513374
## 23 0.7954167 0.5392092
## 25 0.8101389 0.5669348
## 27 0.8111111 0.5707207
## 29 0.8236111 0.5945609
## 31 0.8162500 0.5802513
## 33 0.8098611 0.5625968
## 35 0.8011111 0.5390663
## 37 0.8047222 0.5431640
## 39 0.8095833 0.5487450
## 41 0.8061111 0.5390545
## 43 0.7831944 0.4763469
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 29.
plot(knnModel)
Predict on Test data
vTst <- predict(knnModel,newdata = dfTst)
vTst
## [1] M M M M M M M B M M B M B B M B M M B
## Levels: B M
check accuracy using confusion matrix
confusionMatrix(vTst,factor(dfTst$diagnosis_result))
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 6 0
## M 1 12
##
## Accuracy : 0.9474
## 95% CI : (0.7397, 0.9987)
## No Information Rate : 0.6316
## P-Value [Acc > NIR] : 0.001951
##
## Kappa : 0.8834
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.8571
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9231
## Prevalence : 0.3684
## Detection Rate : 0.3158
## Detection Prevalence : 0.3158
## Balanced Accuracy : 0.9286
##
## 'Positive' Class : B
##
Train on whole dataset
set.seed(9)
knnCntrl <- trainControl(method = "repeatedcv",repeats = 10)
model <- train(diagnosis_result~.,
data=df,
method="knn",
trControl=knnCntrl,
preProcess = c("center","scale"),
tuneLength=20)
print(model)
## k-Nearest Neighbors
##
## 100 samples
## 8 predictor
## 2 classes: 'B', 'M'
##
## Pre-processing: centered (8), scaled (8)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 90, 91, 89, 90, 90, 90, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8273939 0.6140775
## 7 0.8308182 0.6198946
## 9 0.8335051 0.6252236
## 11 0.8493838 0.6558290
## 13 0.8453636 0.6452023
## 15 0.8455859 0.6445489
## 17 0.8350404 0.6213341
## 19 0.8301313 0.6108426
## 21 0.8257980 0.6001055
## 23 0.8297071 0.6123444
## 25 0.8348384 0.6241381
## 27 0.8337475 0.6210012
## 29 0.8384545 0.6297815
## 31 0.8429293 0.6375419
## 33 0.8526768 0.6587389
## 35 0.8615152 0.6793362
## 37 0.8586061 0.6736620
## 39 0.8463838 0.6441712
## 41 0.8452929 0.6418318
## 43 0.8387879 0.6232671
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 35.
Prediction on new unseen data
new_df <-read.csv("ProstateCancerprd.csv",stringsAsFactors = F)
new_df <- select(new_df, -id)
vPrd <- predict(model,newdata = new_df)
Accuracy Check on unseen data
confusionMatrix(vPrd, factor(new_df$diagnosis_result))
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 1 0
## M 1 2
##
## Accuracy : 0.75
## 95% CI : (0.1941, 0.9937)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.3125
##
## Kappa : 0.5
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.6667
## Prevalence : 0.5000
## Detection Rate : 0.2500
## Detection Prevalence : 0.2500
## Balanced Accuracy : 0.7500
##
## 'Positive' Class : B
##