Assignment 5

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrgram)

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus

library(ggplot2)
library(tidyr)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:corrgram':
## 
##     panel.fill

library(sjPlot)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

df <-read.csv("ProstateCancer.csv",stringsAsFactors = F,
              colClasses =c("numeric","character","numeric","numeric","numeric",
                            "numeric","numeric","numeric","numeric","numeric"))
head(df)

##   id diagnosis_result radius texture perimeter area smoothness compactness
## 1  1                M     23      12       151  954      0.143       0.278
## 2  2                B      9      13       133 1326      0.143       0.079
## 3  3                M     21      27       130 1203      0.125       0.160
## 4  4                M     14      16        78  386      0.070       0.284
## 5  5                M      9      19       135 1297      0.141       0.133
## 6  6                B     25      25        83  477      0.128       0.170
##   symmetry fractal_dimension
## 1    0.242             0.079
## 2    0.181             0.057
## 3    0.207             0.060
## 4    0.260             0.097
## 5    0.181             0.059
## 6    0.209             0.076

str(df)

## 'data.frame':    100 obs. of  10 variables:
##  $ id               : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ diagnosis_result : chr  "M" "B" "M" "M" ...
##  $ radius           : num  23 9 21 14 9 25 16 15 19 25 ...
##  $ texture          : num  12 13 27 16 19 25 26 18 24 11 ...
##  $ perimeter        : num  151 133 130 78 135 83 120 90 88 84 ...
##  $ area             : num  954 1326 1203 386 1297 ...
##  $ smoothness       : num  0.143 0.143 0.125 0.07 0.141 0.128 0.095 0.119 0.127 0.119 ...
##  $ compactness      : num  0.278 0.079 0.16 0.284 0.133 0.17 0.109 0.165 0.193 0.24 ...
##  $ symmetry         : num  0.242 0.181 0.207 0.26 0.181 0.209 0.179 0.22 0.235 0.203 ...
##  $ fractal_dimension: num  0.079 0.057 0.06 0.097 0.059 0.076 0.057 0.075 0.074 0.082 ...

summary(df)

##        id         diagnosis_result       radius         texture     
##  Min.   :  1.00   Length:100         Min.   : 9.00   Min.   :11.00  
##  1st Qu.: 25.75   Class :character   1st Qu.:12.00   1st Qu.:14.00  
##  Median : 50.50   Mode  :character   Median :17.00   Median :17.50  
##  Mean   : 50.50                      Mean   :16.85   Mean   :18.23  
##  3rd Qu.: 75.25                      3rd Qu.:21.00   3rd Qu.:22.25  
##  Max.   :100.00                      Max.   :25.00   Max.   :27.00  
##                                                                     
##    perimeter           area          smoothness      compactness    
##  Min.   : 52.00   Min.   : 202.0   Min.   :0.0700   Min.   :0.0380  
##  1st Qu.: 82.50   1st Qu.: 476.8   1st Qu.:0.0935   1st Qu.:0.0790  
##  Median : 94.00   Median : 644.0   Median :0.1020   Median :0.1160  
##  Mean   : 96.78   Mean   : 702.9   Mean   :0.1027   Mean   :0.1246  
##  3rd Qu.:114.25   3rd Qu.: 917.0   3rd Qu.:0.1120   3rd Qu.:0.1550  
##  Max.   :172.00   Max.   :1878.0   Max.   :0.1430   Max.   :0.3450  
##                                                     NA's   :3       
##     symmetry      fractal_dimension
##  Min.   :0.1350   Min.   :0.05300  
##  1st Qu.:0.1720   1st Qu.:0.05900  
##  Median :0.1900   Median :0.06350  
##  Mean   :0.1932   Mean   :0.06482  
##  3rd Qu.:0.2090   3rd Qu.:0.06900  
##  Max.   :0.3040   Max.   :0.09700  
##                   NA's   :2

detectNAs <- function(inp){
  sum(is.na(inp))
}

detectSpaces <- function(inp) {
    if (class(inp) != "character") {
        return ("Non Character Column")
    }
    sum(trimws(inp)=="")
}

detectOutliers <- function(inp,na.rm = TRUE) {
  if (class(inp) != "numeric") {
        return ("Non Numeric Column")
  }
  i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=T)
  i.max <- 1.5 * IQR(inp, na.rm=T)
  otp <- inp
  otp[inp < (i.qnt[1] - i.max)] <- NA
  otp[inp > (i.qnt[2] + i.max)] <- NA
  return(inp[is.na(otp)])
}

ReplaceOutliers <- function(inp,na.rm = TRUE) {
  if (class(inp) != "numeric") {
        return ("Non Numeric Column")
  }
  i.qnt <- quantile(inp, probs = c(.25,.75), na.rm = T)
  i.range <- 1.5 * IQR(inp, na.rm = T)
  inp[inp < (i.qnt[1] - i.range)] <- NA
  inp[inp > (i.qnt[2] + i.range)] <- NA
  return(inp)
}

lapply(df,detectSpaces)

## $id
## [1] "Non Character Column"
## 
## $diagnosis_result
## [1] 0
## 
## $radius
## [1] "Non Character Column"
## 
## $texture
## [1] "Non Character Column"
## 
## $perimeter
## [1] "Non Character Column"
## 
## $area
## [1] "Non Character Column"
## 
## $smoothness
## [1] "Non Character Column"
## 
## $compactness
## [1] "Non Character Column"
## 
## $symmetry
## [1] "Non Character Column"
## 
## $fractal_dimension
## [1] "Non Character Column"

lapply(df,detectNAs)

## $id
## [1] 0
## 
## $diagnosis_result
## [1] 0
## 
## $radius
## [1] 0
## 
## $texture
## [1] 0
## 
## $perimeter
## [1] 0
## 
## $area
## [1] 0
## 
## $smoothness
## [1] 0
## 
## $compactness
## [1] 3
## 
## $symmetry
## [1] 0
## 
## $fractal_dimension
## [1] 2

lapply(df,detectOutliers)

## $id
## numeric(0)
## 
## $diagnosis_result
## [1] "Non Numeric Column"
## 
## $radius
## numeric(0)
## 
## $texture
## numeric(0)
## 
## $perimeter
## [1] 172
## 
## $area
## [1] 1878
## 
## $smoothness
## [1] 0.143 0.143 0.141
## 
## $compactness
## [1] 0.278 0.284    NA    NA    NA 0.345
## 
## $symmetry
## [1] 0.304 0.274 0.291
## 
## $fractal_dimension
## [1] 0.097    NA    NA 0.090

#lapply(df,ReplaceOutliers)

df$compactness[is.na(df$compactness)] <- mean(df$compactness,na.rm = T)
df$fractal_dimension[is.na(df$fractal_dimension)] <- mean(df$fractal_dimension, na.rm = T)

lapply(df, detectNAs)

## $id
## [1] 0
## 
## $diagnosis_result
## [1] 0
## 
## $radius
## [1] 0
## 
## $texture
## [1] 0
## 
## $perimeter
## [1] 0
## 
## $area
## [1] 0
## 
## $smoothness
## [1] 0
## 
## $compactness
## [1] 0
## 
## $symmetry
## [1] 0
## 
## $fractal_dimension
## [1] 0

Exploratory Data Analsysi

df <- select(df,-id)
head(df)

##   diagnosis_result radius texture perimeter area smoothness compactness
## 1                M     23      12       151  954      0.143       0.278
## 2                B      9      13       133 1326      0.143       0.079
## 3                M     21      27       130 1203      0.125       0.160
## 4                M     14      16        78  386      0.070       0.284
## 5                M      9      19       135 1297      0.141       0.133
## 6                B     25      25        83  477      0.128       0.170
##   symmetry fractal_dimension
## 1    0.242             0.079
## 2    0.181             0.057
## 3    0.207             0.060
## 4    0.260             0.097
## 5    0.181             0.059
## 6    0.209             0.076

cor(df[2:9])

##                        radius     texture  perimeter       area smoothness
## radius             1.00000000  0.10024503 -0.2382158 -0.2509343 -0.1271207
## texture            0.10024503  1.00000000 -0.1134528 -0.1137246  0.1023214
## perimeter         -0.23821583 -0.11345280  1.0000000  0.9766481  0.2694421
## area              -0.25093433 -0.11372456  0.9766481  1.0000000  0.2084380
## smoothness        -0.12712075  0.10232140  0.2694421  0.2084380  1.0000000
## compactness       -0.19343551  0.01365126  0.5009115  0.4007251  0.4700983
## symmetry          -0.03970719  0.07791221  0.1955394  0.1104354  0.4242026
## fractal_dimension -0.02337681  0.13062365 -0.1752467 -0.2500199  0.3672369
##                   compactness    symmetry fractal_dimension
## radius            -0.19343551 -0.03970719       -0.02337681
## texture            0.01365126  0.07791221        0.13062365
## perimeter          0.50091154  0.19553938       -0.17524667
## area               0.40072506  0.11043543       -0.25001988
## smoothness         0.47009830  0.42420260        0.36723688
## compactness        1.00000000  0.60992055        0.61844856
## symmetry           0.60992055  1.00000000        0.55775260
## fractal_dimension  0.61844856  0.55775260        1.00000000

corrgram(df[2:9])

Visual data Analysis

ggplot.diagnosis <- ggplot(df, aes(x = diagnosis_result))+
                    geom_histogram(stat = 'count',aes(fill = diagnosis_result))+
                    labs(title = "Count of class",x = "diagnosis_result")

## Warning: Ignoring unknown parameters: binwidth, bins, pad

ggplotly(ggplot.diagnosis)

ggplot <- ggplot(df, aes(x = diagnosis_result, y = perimeter))+
          geom_boxplot()+
          labs(title = "Result according to perimeter",x = "Diagnosis_result", y = "Perimeter")
ggplot

observation People having malignant prostate cancer has higher perimeter than benoign ones

wss function for obtaining optimum value of k

#for optimum value of K, we must give it predictor variables
dft <- select(df, -diagnosis_result)
sjc.elbow(dft)

observation # from graph we can see that k = 12 should be a good number for distance calculation # Let’s check with an KNN algorithm

split datasets

part <- createDataPartition(df$diagnosis_result,p = 0.8,list = FALSE)
dfTrn <- df[part,]
dfTst <- df[-part,]

summary(dfTrn)

##  diagnosis_result       radius         texture        perimeter     
##  Length:81          Min.   : 9.00   Min.   :11.00   Min.   : 52.00  
##  Class :character   1st Qu.:12.00   1st Qu.:14.00   1st Qu.: 83.00  
##  Mode  :character   Median :18.00   Median :18.00   Median : 94.00  
##                     Mean   :17.17   Mean   :18.33   Mean   : 97.47  
##                     3rd Qu.:21.00   3rd Qu.:23.00   3rd Qu.:120.00  
##                     Max.   :25.00   Max.   :27.00   Max.   :172.00  
##       area          smoothness     compactness       symmetry    
##  Min.   : 202.0   Min.   :0.070   Min.   :0.047   Min.   :0.135  
##  1st Qu.: 477.0   1st Qu.:0.094   1st Qu.:0.082   1st Qu.:0.174  
##  Median : 584.0   Median :0.101   Median :0.122   Median :0.190  
##  Mean   : 713.9   Mean   :0.103   Mean   :0.126   Mean   :0.193  
##  3rd Qu.: 955.0   3rd Qu.:0.112   3rd Qu.:0.154   3rd Qu.:0.208  
##  Max.   :1878.0   Max.   :0.143   Max.   :0.345   Max.   :0.304  
##  fractal_dimension
##  Min.   :0.05300  
##  1st Qu.:0.05900  
##  Median :0.06400  
##  Mean   :0.06509  
##  3rd Qu.:0.06900  
##  Max.   :0.09700

head(dfTrn)

##   diagnosis_result radius texture perimeter area smoothness compactness
## 1                M     23      12       151  954      0.143       0.278
## 2                B      9      13       133 1326      0.143       0.079
## 3                M     21      27       130 1203      0.125       0.160
## 4                M     14      16        78  386      0.070       0.284
## 5                M      9      19       135 1297      0.141       0.133
## 6                B     25      25        83  477      0.128       0.170
##   symmetry fractal_dimension
## 1    0.242             0.079
## 2    0.181             0.057
## 3    0.207             0.060
## 4    0.260             0.097
## 5    0.181             0.059
## 6    0.209             0.076

summary(dfTst)

##  diagnosis_result       radius         texture        perimeter     
##  Length:19          Min.   :10.00   Min.   :11.00   Min.   : 60.00  
##  Class :character   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 81.50  
##  Mode  :character   Median :15.00   Median :16.00   Median :100.00  
##                     Mean   :15.47   Mean   :17.79   Mean   : 93.84  
##                     3rd Qu.:19.50   3rd Qu.:21.50   3rd Qu.:106.00  
##                     Max.   :22.00   Max.   :27.00   Max.   :120.00  
##       area          smoothness      compactness        symmetry     
##  Min.   : 269.0   Min.   :0.0810   Min.   :0.0380   Min.   :0.1350  
##  1st Qu.: 497.5   1st Qu.:0.0920   1st Qu.:0.0750   1st Qu.:0.1720  
##  Median : 704.0   Median :0.1040   Median :0.1040   Median :0.1900  
##  Mean   : 656.1   Mean   :0.1017   Mean   :0.1185   Mean   :0.1937  
##  3rd Qu.: 791.0   3rd Qu.:0.1105   3rd Qu.:0.1525   3rd Qu.:0.2120  
##  Max.   :1033.0   Max.   :0.1200   Max.   :0.2150   Max.   :0.2740  
##  fractal_dimension
##  Min.   :0.05300  
##  1st Qu.:0.05900  
##  Median :0.06400  
##  Mean   :0.06363  
##  3rd Qu.:0.06850  
##  Max.   :0.07400

head(dfTst)

##    diagnosis_result radius texture perimeter area smoothness compactness
## 14                M     12      22       104  783      0.084       0.100
## 17                M     10      16        95  685      0.099       0.072
## 18                M     15      14       108  799      0.117       0.202
## 23                M     20      27       103  704      0.107       0.214
## 29                M     15      15       102  732      0.108       0.170
## 33                M     20      18       113  899      0.120       0.150
##    symmetry fractal_dimension
## 14    0.185             0.053
## 17    0.159             0.059
## 18    0.216             0.074
## 23    0.252             0.070
## 29    0.193             0.065
## 33    0.225             0.064

Build Model

set.seed(1)
knnCntrl <- trainControl(method="repeatedcv", number=10, repeats=10)
knnModel <- train(diagnosis_result~., 
                          data=dfTrn, 
                          method="knn", 
                          trControl=knnCntrl,
                    preProcess = c("center","scale"), 
                    tuneLength=20)
print(knnModel)

## k-Nearest Neighbors 
## 
## 81 samples
##  8 predictor
##  2 classes: 'B', 'M' 
## 
## Pre-processing: centered (8), scaled (8) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 73, 73, 73, 72, 73, 73, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.7895833  0.5435682
##    7  0.7825000  0.5253897
##    9  0.7911111  0.5361349
##   11  0.8150000  0.5835796
##   13  0.8063889  0.5672023
##   15  0.7851389  0.5222607
##   17  0.7926389  0.5380497
##   19  0.7926389  0.5404645
##   21  0.7988889  0.5513374
##   23  0.7954167  0.5392092
##   25  0.8101389  0.5669348
##   27  0.8111111  0.5707207
##   29  0.8236111  0.5945609
##   31  0.8162500  0.5802513
##   33  0.8098611  0.5625968
##   35  0.8011111  0.5390663
##   37  0.8047222  0.5431640
##   39  0.8095833  0.5487450
##   41  0.8061111  0.5390545
##   43  0.7831944  0.4763469
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 29.

plot(knnModel)

Predict on Test data

vTst <- predict(knnModel,newdata = dfTst)
vTst

##  [1] M M M M M M M B M M B M B B M B M M B
## Levels: B M

check accuracy using confusion matrix

confusionMatrix(vTst,factor(dfTst$diagnosis_result))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B  6  0
##          M  1 12
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.7397, 0.9987)
##     No Information Rate : 0.6316          
##     P-Value [Acc > NIR] : 0.001951        
##                                           
##                   Kappa : 0.8834          
##                                           
##  Mcnemar's Test P-Value : 1.000000        
##                                           
##             Sensitivity : 0.8571          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9231          
##              Prevalence : 0.3684          
##          Detection Rate : 0.3158          
##    Detection Prevalence : 0.3158          
##       Balanced Accuracy : 0.9286          
##                                           
##        'Positive' Class : B               
##

Train on whole dataset

set.seed(9)
knnCntrl <- trainControl(method = "repeatedcv",repeats = 10)
model <- train(diagnosis_result~., 
                       data=df, 
                       method="knn", 
                       trControl=knnCntrl,
                 preProcess = c("center","scale"), 
                 tuneLength=20)
print(model)

## k-Nearest Neighbors 
## 
## 100 samples
##   8 predictor
##   2 classes: 'B', 'M' 
## 
## Pre-processing: centered (8), scaled (8) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 90, 91, 89, 90, 90, 90, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.8273939  0.6140775
##    7  0.8308182  0.6198946
##    9  0.8335051  0.6252236
##   11  0.8493838  0.6558290
##   13  0.8453636  0.6452023
##   15  0.8455859  0.6445489
##   17  0.8350404  0.6213341
##   19  0.8301313  0.6108426
##   21  0.8257980  0.6001055
##   23  0.8297071  0.6123444
##   25  0.8348384  0.6241381
##   27  0.8337475  0.6210012
##   29  0.8384545  0.6297815
##   31  0.8429293  0.6375419
##   33  0.8526768  0.6587389
##   35  0.8615152  0.6793362
##   37  0.8586061  0.6736620
##   39  0.8463838  0.6441712
##   41  0.8452929  0.6418318
##   43  0.8387879  0.6232671
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 35.

Prediction on new unseen data

new_df <-read.csv("ProstateCancerprd.csv",stringsAsFactors = F)
new_df <- select(new_df, -id)

vPrd <- predict(model,newdata = new_df)

Accuracy Check on unseen data

confusionMatrix(vPrd, factor(new_df$diagnosis_result))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction B M
##          B 1 0
##          M 1 2
##                                           
##                Accuracy : 0.75            
##                  95% CI : (0.1941, 0.9937)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.3125          
##                                           
##                   Kappa : 0.5             
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5000          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2500          
##    Detection Prevalence : 0.2500          
##       Balanced Accuracy : 0.7500          
##                                           
##        'Positive' Class : B               
##

Assignment 5

Rohan Khollamkar

01/05/2019