breastCancer.R

library(ggbiplot)

## Loading required package: ggplot2

## Loading required package: plyr

## Loading required package: scales

## Loading required package: grid

library(tidyverse)

## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ tibble  2.0.1     ✔ purrr   0.2.5
## ✔ tidyr   0.8.2     ✔ dplyr   0.7.8
## ✔ readr   1.3.1     ✔ stringr 1.3.1
## ✔ tibble  2.0.1     ✔ forcats 0.3.0

## Warning: package 'tibble' was built under R version 3.5.2

## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()    masks plyr::arrange()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::compact()    masks plyr::compact()
## ✖ dplyr::count()      masks plyr::count()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::failwith()   masks plyr::failwith()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::id()         masks plyr::id()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::mutate()     masks plyr::mutate()
## ✖ dplyr::rename()     masks plyr::rename()
## ✖ dplyr::summarise()  masks plyr::summarise()
## ✖ dplyr::summarize()  masks plyr::summarize()

library(scales)

## load data and name columns properly
breastCancer <-
  read.csv("~/Desktop/CUproject/breast-cancer-wisconsin.csv",
           na.strings = "?")
colnames(breastCancer) <-
  c(
    "id",
    "clumpThickness",
    "uniformityCellSize",
    "uniformityCellShape",
    "marginalAdhesion",
    "singleEpithelCellSize",
    "bareNuclei",
    "blandChromatin",
    "normalNucleoli",
    "mitoses",
    "class"
  )

str(breastCancer)

## 'data.frame':    698 obs. of  11 variables:
##  $ id                   : int  1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 1035283 ...
##  $ clumpThickness       : int  5 3 6 4 8 1 2 2 4 1 ...
##  $ uniformityCellSize   : int  4 1 8 1 10 1 1 1 2 1 ...
##  $ uniformityCellShape  : int  4 1 8 1 10 1 2 1 1 1 ...
##  $ marginalAdhesion     : int  5 1 1 3 8 1 1 1 1 1 ...
##  $ singleEpithelCellSize: int  7 2 3 2 7 2 2 2 2 1 ...
##  $ bareNuclei           : int  10 2 4 1 10 10 1 1 1 1 ...
##  $ blandChromatin       : int  3 3 3 3 9 3 3 1 2 3 ...
##  $ normalNucleoli       : int  2 1 7 1 7 1 1 1 1 1 ...
##  $ mitoses              : int  1 1 1 1 1 1 1 5 1 1 ...
##  $ class                : int  2 2 2 2 4 2 2 2 2 2 ...

## remove id
breastCancer$id <- NULL
## see summary
summary(breastCancer)

##  clumpThickness   uniformityCellSize uniformityCellShape marginalAdhesion
##  Min.   : 1.000   Min.   : 1.000     Min.   : 1.000      Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000     1st Qu.: 1.000      1st Qu.: 1.000  
##  Median : 4.000   Median : 1.000     Median : 1.000      Median : 1.000  
##  Mean   : 4.417   Mean   : 3.138     Mean   : 3.211      Mean   : 2.809  
##  3rd Qu.: 6.000   3rd Qu.: 5.000     3rd Qu.: 5.000      3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000     Max.   :10.000      Max.   :10.000  
##                                                                          
##  singleEpithelCellSize   bareNuclei     blandChromatin   normalNucleoli 
##  Min.   : 1.000        Min.   : 1.000   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 2.000        1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.00  
##  Median : 2.000        Median : 1.000   Median : 3.000   Median : 1.00  
##  Mean   : 3.218        Mean   : 3.548   Mean   : 3.438   Mean   : 2.87  
##  3rd Qu.: 4.000        3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.00  
##  Max.   :10.000        Max.   :10.000   Max.   :10.000   Max.   :10.00  
##                        NA's   :16                                       
##     mitoses          class      
##  Min.   : 1.00   Min.   :2.000  
##  1st Qu.: 1.00   1st Qu.:2.000  
##  Median : 1.00   Median :2.000  
##  Mean   : 1.59   Mean   :2.691  
##  3rd Qu.: 1.00   3rd Qu.:4.000  
##  Max.   :10.00   Max.   :4.000  
##

" only bare nuclei has 16 NAs "

## [1] " only bare nuclei has 16 NAs "

########## EDA #########
breastCancer$class <- as.factor(breastCancer$class)
levels(breastCancer$class)

## [1] "2" "4"

levels(breastCancer$class) <- c("benign", "malignant")

### number of benign and malignant tumors
total <- nrow(breastCancer)                                   # 698
benign <- length(which(breastCancer$class == "benign"))       # 457
malignant <- length(which(breastCancer$class == "malignant")) # 241
benign / total * 100

## [1] 65.47278

malignant / total * 100

## [1] 34.52722

" So, out of 689 rows, 457 i.e. 65.47% are benign and 241 i.e. 34.53% is malignant "

## [1] " So, out of 689 rows, 457 i.e. 65.47% are benign and 241 i.e. 34.53% is malignant "

" though the data is little off-balancced it is workable "

## [1] " though the data is little off-balancced it is workable "

##### univariate analysis #####
univariate <- function(dt, variable, var_name) {
  ggplot(dt, aes(fct_infreq(variable))) +
    geom_bar(aes(y = (..count..) / sum(..count..)), fill = "coral") +
    geom_text(aes(y = ((..count..) / sum(..count..)), label = scales::percent((..count..) /
                                                                                sum(..count..))),
              stat = "count",
              vjust = -0.15) +
    scale_y_continuous(labels = percent) +
    labs(title = var_name, x = var_name, y = "Percent") +
    theme(
      axis.text.y = element_blank(),
      axis.ticks = element_blank(),
      axis.title.y = element_blank(),
      axis.text.x = element_text()
    )
}

### class
univariate(breastCancer, breastCancer$class, "Rate of Malignant")

" 34.5% malignant, 65.5% benign "

## [1] " 34.5% malignant, 65.5% benign "

### clump thikness
univariate(breastCancer,
           as.factor(breastCancer$clumpThickness),
           "Clump Thickness")

" highest 1 and lowest 9"

## [1] " highest 1 and lowest 9"

### Uniformity of Cell Size
univariate(breastCancer,
           as.factor(breastCancer$uniformityCellSize),
           "Uniformity of Cell Size ")

" 1 has very high value (54.9%) compared to others, so it could be treated as outlier "

## [1] " 1 has very high value (54.9%) compared to others, so it could be treated as outlier "

### Uniformity of Cell Shape
univariate(
  breastCancer,
  as.factor(breastCancer$uniformityCellShape),
  "Uniformity of Cell Shape "
)

" 1 has very high value (50.4%)n compared to others, could possibly be outlier "

## [1] " 1 has very high value (50.4%)n compared to others, could possibly be outlier "

### Marginal Adhesion
univariate(breastCancer,
           as.factor(breastCancer$marginalAdhesion),
           "Marginal Adhesion")

" 1 has most common value (58.2%) "

## [1] " 1 has most common value (58.2%) "

### Single Epithelial Cell Size
univariate(
  breastCancer,
  as.factor(breastCancer$singleEpithelCellSize),
  "Single Epithelial Cell Size"
)

" 2 has highest value (55.2%) "

## [1] " 2 has highest value (55.2%) "

### Bare Nuclei
univariate(breastCancer,
           as.factor(breastCancer$bareNuclei),
           "Bare Nuclei")

" 1 has highest value(57.4%), 2.3% NA"

## [1] " 1 has highest value(57.4%), 2.3% NA"

### Bland Chromatin
univariate(breastCancer,
           as.factor(breastCancer$blandChromatin),
           "Bland Chromatin")

" 2,3,1 has more or less same values, more evenly distributed than other variables "

## [1] " 2,3,1 has more or less same values, more evenly distributed than other variables "

### Normal Nucleoli
univariate(breastCancer,
           as.factor(breastCancer$normalNucleoli),
           "Normal Nucleoli")

" 1 has 63.3% values "

## [1] " 1 has 63.3% values "

### Mitoses
univariate(breastCancer, as.factor(breastCancer$mitoses), "Mitoses")

" 1 has 82.8% values "

## [1] " 1 has 82.8% values "

#### box plot of the variables ####
### wrt clump thikness
ggplot(breastCancer, aes(x = class, y = clumpThickness)) + geom_boxplot(aes(fill = class))

" there is a clear relation between higher clump thickness and malignancy "

## [1] " there is a clear relation between higher clump thickness and malignancy "

### wrt Uniformity of Cell Size
ggplot(breastCancer, aes(x = class, y = uniformityCellSize)) + geom_boxplot(aes(fill = class))

" except some outliers, higher uniformioty of cell size is linked with higher chane of malignancy "

## [1] " except some outliers, higher uniformioty of cell size is linked with higher chane of malignancy "

### wrt Uniformity of Cell Shape
ggplot(breastCancer, aes(x = class, y = uniformityCellShape)) + geom_boxplot(aes(fill = class))

" uniformity of cell shape has very identical result with uniformity cell size "

## [1] " uniformity of cell shape has very identical result with uniformity cell size "

cor(breastCancer$uniformityCellSize,
    breastCancer$uniformityCellShape)

## [1] 0.9068137

" .91 correlation; highly correlated "

## [1] " .91 correlation; highly correlated "

### wrt Marginal Adhesion
ggplot(breastCancer, aes(x = class, y = marginalAdhesion)) + geom_boxplot(aes(fill = class))

" higher marginal adhesion is linked with higher malignancy "

## [1] " higher marginal adhesion is linked with higher malignancy "

### wrt Single Epithelial Cell Size
ggplot(breastCancer, aes(x = class, y = singleEpithelCellSize)) + geom_boxplot(aes(fill = class))

" Here too with higher single epithelial cell size chances of mal;ignancy increases "

## [1] " Here too with higher single epithelial cell size chances of mal;ignancy increases "

### wrt Bare Nuclei
ggplot(breastCancer, aes(x = class, y = bareNuclei)) + geom_boxplot(aes(fill = class))

## Warning: Removed 16 rows containing non-finite values (stat_boxplot).

" there is a very high correlation between bare nuclei and malignancy "

## [1] " there is a very high correlation between bare nuclei and malignancy "

### wrt Bland Chromatin
ggplot(breastCancer, aes(x = class, y = blandChromatin)) + geom_boxplot(aes(fill = class))

" higher bland chromatin correlates to higher malignancy "

## [1] " higher bland chromatin correlates to higher malignancy "

### wrt Normal Nucleoli
ggplot(breastCancer, aes(x = class, y = normalNucleoli)) + geom_boxplot(aes(fill = class))

" higer value correlates to higher chance of malignancy "

## [1] " higer value correlates to higher chance of malignancy "

### wrt Mitoses
ggplot(breastCancer, aes(x = class, y = mitoses)) + geom_boxplot(aes(fill = class))

" here there isn't much significant difference; butr malignants tend to have more tendency of mitoses"

## [1] " here there isn't much significant difference; butr malignants tend to have more tendency of mitoses"

##### bivariate analysis #####

### correlation plot
M <- cor(breastCancer[-ncol(breastCancer)] %>% drop_na())
col1 <-
  colorRampPalette(
    c(
      "#7F0000",
      "red",
      "#FF7F00",
      "yellow",
      "white",
      "cyan",
      "#007FFF",
      "blue",
      "#00007F"
    )
  )
library(corrplot)

## corrplot 0.84 loaded

corrplot(
  M,
  method = "color",
  cl.lim = c(-1, 1),
  col = col1(100),
  addgrid.col = "black"
)

" all of the columns have positive correlation "

## [1] " all of the columns have positive correlation "

## uniformity cell shape vs uniformity cell size
ggplot(breastCancer) + geom_boxplot(aes(x = uniformityCellSize, y = uniformityCellShape, fill = class)) +
  geom_line(
    data = (
      breastCancer %>% group_by(uniformityCellSize) %>%
        summarize(avg = mean(uniformityCellShape))
    ),
    aes(x = uniformityCellSize, y = avg, group = 1)
  )

" Though there is mostly linear trend, there are some fluctuation, so it it better to keep both of them "

## [1] " Though there is mostly linear trend, there are some fluctuation, so it it better to keep both of them "

### scatter plot
## uniformity of cell size and shape
ggplot(breastCancer) + geom_point(aes(x = uniformityCellSize, y = uniformityCellShape, col = class))

" data points are linearly seperable "

## [1] " data points are linearly seperable "

##### Principal Component Analysis #####
summary(breastCancer[-ncol(breastCancer)] %>% drop_na)

##  clumpThickness   uniformityCellSize uniformityCellShape marginalAdhesion
##  Min.   : 1.000   Min.   : 1.000     Min.   : 1.000      Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000     1st Qu.: 1.000      1st Qu.: 1.000  
##  Median : 4.000   Median : 1.000     Median : 1.000      Median : 1.000  
##  Mean   : 4.441   Mean   : 3.154     Mean   : 3.218      Mean   : 2.833  
##  3rd Qu.: 6.000   3rd Qu.: 5.000     3rd Qu.: 5.000      3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000     Max.   :10.000      Max.   :10.000  
##  singleEpithelCellSize   bareNuclei     blandChromatin   normalNucleoli  
##  Min.   : 1.000        Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000        1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000  
##  Median : 2.000        Median : 1.000   Median : 3.000   Median : 1.000  
##  Mean   : 3.236        Mean   : 3.548   Mean   : 3.446   Mean   : 2.872  
##  3rd Qu.: 4.000        3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000        Max.   :10.000   Max.   :10.000   Max.   :10.000  
##     mitoses      
##  Min.   : 1.000  
##  1st Qu.: 1.000  
##  Median : 1.000  
##  Mean   : 1.604  
##  3rd Qu.: 1.000  
##  Max.   :10.000

##                       clumpThickness uniformityCellSize
## clumpThickness             1.0000000          0.6429362
## uniformityCellSize         0.6429362          1.0000000
## uniformityCellShape        0.6539679          0.9071584
## marginalAdhesion           0.4881746          0.7067860
## singleEpithelCellSize      0.5238909          0.7534148
## bareNuclei                 0.5935238          0.6914868
## blandChromatin             0.5538245          0.7556635
## normalNucleoli             0.5344063          0.7191730
## mitoses                    0.3510996          0.4606035
##                       uniformityCellShape marginalAdhesion
## clumpThickness                  0.6539679        0.4881746
## uniformityCellSize              0.9071584        0.7067860
## uniformityCellShape             1.0000000        0.6857348
## marginalAdhesion                0.6857348        1.0000000
## singleEpithelCellSize           0.7223130        0.5943395
## bareNuclei                      0.7136609        0.6704341
## blandChromatin                  0.7354603        0.6686132
## normalNucleoli                  0.7177840        0.6028932
## mitoses                         0.4410959        0.4187345
##                       singleEpithelCellSize bareNuclei blandChromatin
## clumpThickness                    0.5238909  0.5935238      0.5538245
## uniformityCellSize                0.7534148  0.6914868      0.7556635
## uniformityCellShape               0.7223130  0.7136609      0.7354603
## marginalAdhesion                  0.5943395  0.6704341      0.6686132
## singleEpithelCellSize             1.0000000  0.5854889      0.6181347
## bareNuclei                        0.5854889  1.0000000      0.6806888
## blandChromatin                    0.6181347  0.6806888      1.0000000
## normalNucleoli                    0.6287425  0.5840221      0.6656376
## mitoses                           0.4804510  0.3390050      0.3459572
##                       normalNucleoli   mitoses
## clumpThickness             0.5344063 0.3510996
## uniformityCellSize         0.7191730 0.4606035
## uniformityCellShape        0.7177840 0.4410959
## marginalAdhesion           0.6028932 0.4187345
## singleEpithelCellSize      0.6287425 0.4804510
## bareNuclei                 0.5840221 0.3390050
## blandChromatin             0.6656376 0.3459572
## normalNucleoli             1.0000000 0.4336022
## mitoses                    0.4336022 1.0000000

breastCancer.PCA <-
  princomp(breastCancer[-ncol(breastCancer)] %>% drop_na,
           cor = T,
           scores = T)
summary(breastCancer.PCA)

## Importance of components:
##                           Comp.1     Comp.2     Comp.3     Comp.4
## Standard deviation     2.4288437 0.88096641 0.73407879 0.67815051
## Proportion of Variance 0.6554757 0.08623354 0.05987463 0.05109868
## Cumulative Proportion  0.6554757 0.74170928 0.80158391 0.85268259
##                            Comp.5     Comp.6     Comp.7     Comp.8
## Standard deviation     0.61678824 0.54958062 0.54251544 0.51050669
## Proportion of Variance 0.04226975 0.03355987 0.03270256 0.02895745
## Cumulative Proportion  0.89495233 0.92851221 0.96121476 0.99017222
##                             Comp.9
## Standard deviation     0.297405528
## Proportion of Variance 0.009827783
## Cumulative Proportion  1.000000000

## check loadings
loadings(breastCancer.PCA)

## 
## Loadings:
##                       Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## clumpThickness         0.302  0.140  0.866  0.108         0.243       
## uniformityCellSize     0.381               -0.204 -0.146  0.140  0.208
## uniformityCellShape    0.378               -0.176 -0.108         0.131
## marginalAdhesion       0.333        -0.413  0.493         0.654 -0.127
## singleEpithelCellSize  0.336 -0.164        -0.427 -0.637        -0.215
## bareNuclei             0.335  0.261         0.499 -0.124 -0.610 -0.401
## blandChromatin         0.346  0.228 -0.214         0.226 -0.298  0.698
## normalNucleoli         0.336        -0.134 -0.417  0.690        -0.460
## mitoses                0.230 -0.906         0.259  0.105 -0.148  0.133
##                       Comp.8 Comp.9
## clumpThickness         0.249       
## uniformityCellSize    -0.435  0.733
## uniformityCellShape   -0.582 -0.667
## marginalAdhesion       0.162       
## singleEpithelCellSize  0.456       
## bareNuclei            -0.130       
## blandChromatin         0.389       
## normalNucleoli                     
## mitoses                            
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.111  0.111  0.111  0.111  0.111  0.111  0.111  0.111
## Cumulative Var  0.111  0.222  0.333  0.444  0.556  0.667  0.778  0.889
##                Comp.9
## SS loadings     1.000
## Proportion Var  0.111
## Cumulative Var  1.000

## scree plot 
screeplot(breastCancer.PCA, type = 'lines', main = 'Scree Plot')

ggbiplot(breastCancer.PCA)

##### Model Building and Evaluation #####

#### Logistic Regression ####

## convert benign to 0 and mailgnat to 1 in class column
breastCancerNum <- na.omit(breastCancer)
levels(breastCancerNum$class) <- c(0,1)
breastCancerNum$class <- as.numeric(levels(breastCancerNum$class))[breastCancerNum$class]

## split data into test train
set.seed(100)
trainindices <- sample(1:nrow(breastCancerNum), 0.7*nrow(breastCancerNum))

train <- breastCancerNum[trainindices,]
test <- breastCancerNum[-trainindices,]

## necessary libraries
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

## create first model using all the variables in the dataset
logreg.Model1 <- glm(class ~ ., data = train, family = 'binomial')
summary(logreg.Model1)

## 
## Call:
## glm(formula = class ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.28627  -0.05472  -0.02005   0.00301   2.14122  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -14.46156    3.26806  -4.425 9.64e-06 ***
## clumpThickness          0.87477    0.27001   3.240  0.00120 ** 
## uniformityCellSize     -0.11343    0.36445  -0.311  0.75562    
## uniformityCellShape     0.28370    0.41230   0.688  0.49140    
## marginalAdhesion        0.37257    0.18136   2.054  0.03995 *  
## singleEpithelCellSize   0.04411    0.22465   0.196  0.84434    
## bareNuclei              0.44147    0.14172   3.115  0.00184 ** 
## blandChromatin          0.65945    0.24692   2.671  0.00757 ** 
## normalNucleoli          0.46480    0.18952   2.453  0.01419 *  
## mitoses                 1.60276    1.95074   0.822  0.41129    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 615.215  on 476  degrees of freedom
## Residual deviance:  46.239  on 467  degrees of freedom
## AIC: 66.239
## 
## Number of Fisher Scoring iterations: 11

" AIC: 66.239 "

## [1] " AIC: 66.239 "

## use stepAIC for stepwise selection
logreg.Model2 <- stepAIC(logreg.Model1, direction = 'both')

## Start:  AIC=66.24
## class ~ clumpThickness + uniformityCellSize + uniformityCellShape + 
##     marginalAdhesion + singleEpithelCellSize + bareNuclei + blandChromatin + 
##     normalNucleoli + mitoses

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##                         Df Deviance    AIC
## - singleEpithelCellSize  1   46.276 64.276
## - uniformityCellSize     1   46.336 64.336
## - uniformityCellShape    1   46.728 64.728
## <none>                       46.239 66.239
## - mitoses                1   49.227 67.227
## - marginalAdhesion       1   50.296 68.296
## - normalNucleoli         1   54.181 72.181
## - blandChromatin         1   54.722 72.722
## - bareNuclei             1   58.460 76.460
## - clumpThickness         1   62.185 80.185
## 
## Step:  AIC=64.28
## class ~ clumpThickness + uniformityCellSize + uniformityCellShape + 
##     marginalAdhesion + bareNuclei + blandChromatin + normalNucleoli + 
##     mitoses

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##                         Df Deviance    AIC
## - uniformityCellSize     1   46.374 62.374
## - uniformityCellShape    1   46.789 62.789
## <none>                       46.276 64.276
## - mitoses                1   49.303 65.303
## + singleEpithelCellSize  1   46.239 66.239
## - marginalAdhesion       1   50.350 66.350
## - normalNucleoli         1   54.648 70.648
## - blandChromatin         1   54.954 70.954
## - bareNuclei             1   59.252 75.252
## - clumpThickness         1   62.230 78.230
## 
## Step:  AIC=62.37
## class ~ clumpThickness + uniformityCellShape + marginalAdhesion + 
##     bareNuclei + blandChromatin + normalNucleoli + mitoses

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##                         Df Deviance    AIC
## - uniformityCellShape    1   47.076 61.076
## <none>                       46.374 62.374
## - mitoses                1   49.459 63.459
## + uniformityCellSize     1   46.276 64.276
## + singleEpithelCellSize  1   46.336 64.336
## - marginalAdhesion       1   50.397 64.397
## - blandChromatin         1   55.105 69.105
## - normalNucleoli         1   55.266 69.266
## - bareNuclei             1   60.622 74.622
## - clumpThickness         1   62.320 76.320
## 
## Step:  AIC=61.08
## class ~ clumpThickness + marginalAdhesion + bareNuclei + blandChromatin + 
##     normalNucleoli + mitoses

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##                         Df Deviance    AIC
## <none>                       47.076 61.076
## + uniformityCellShape    1   46.374 62.374
## + uniformityCellSize     1   46.789 62.789
## - mitoses                1   50.869 62.869
## + singleEpithelCellSize  1   46.988 62.988
## - marginalAdhesion       1   53.233 65.233
## - normalNucleoli         1   59.650 71.650
## - blandChromatin         1   62.165 74.165
## - bareNuclei             1   70.996 82.996
## - clumpThickness         1   75.352 87.352

summary(logreg.Model2)

## 
## Call:
## glm(formula = class ~ clumpThickness + marginalAdhesion + bareNuclei + 
##     blandChromatin + normalNucleoli + mitoses, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.28081  -0.05283  -0.01808   0.00190   2.41599  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -14.7827     3.1020  -4.766 1.88e-06 ***
## clumpThickness     0.9595     0.2509   3.824 0.000131 ***
## marginalAdhesion   0.4036     0.1657   2.435 0.014875 *  
## bareNuclei         0.5100     0.1223   4.171 3.04e-05 ***
## blandChromatin     0.7337     0.2194   3.344 0.000826 ***
## normalNucleoli     0.5272     0.1743   3.024 0.002491 ** 
## mitoses            1.4627     1.7391   0.841 0.400322    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 615.215  on 476  degrees of freedom
## Residual deviance:  47.076  on 470  degrees of freedom
## AIC: 61.076
## 
## Number of Fisher Scoring iterations: 11

" AIC: 61.076 "

## [1] " AIC: 61.076 "

sort(vif(logreg.Model2))

##          mitoses       bareNuclei   blandChromatin   normalNucleoli 
##         1.056305         1.081025         1.136740         1.166521 
## marginalAdhesion   clumpThickness 
##         1.189952         1.345116

" all the vif are very low so we can check importance of the variables solely based on p-values "

## [1] " all the vif are very low so we can check importance of the variables solely based on p-values "

## removing mitoses based on high p-value
logreg.Model3 <- glm(formula = class ~ clumpThickness + marginalAdhesion + bareNuclei + 
                       blandChromatin + normalNucleoli, family = "binomial", 
                     data = train)
summary(logreg.Model3)

## 
## Call:
## glm(formula = class ~ clumpThickness + marginalAdhesion + bareNuclei + 
##     blandChromatin + normalNucleoli, family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.41708  -0.05025  -0.01409   0.00489   2.31797  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -13.8935     2.4668  -5.632 1.78e-08 ***
## clumpThickness     1.1343     0.2527   4.489 7.15e-06 ***
## marginalAdhesion   0.4128     0.1632   2.529 0.011434 *  
## bareNuclei         0.5042     0.1216   4.146 3.39e-05 ***
## blandChromatin     0.6969     0.2069   3.368 0.000756 ***
## normalNucleoli     0.5342     0.1687   3.167 0.001541 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 615.215  on 476  degrees of freedom
## Residual deviance:  50.869  on 471  degrees of freedom
## AIC: 62.869
## 
## Number of Fisher Scoring iterations: 9

" AIC: 62.869 "

## [1] " AIC: 62.869 "

## removing marginalAdhesion based on p-values
logreg.Model4 <- glm(formula = class ~ clumpThickness + bareNuclei + 
                       blandChromatin + normalNucleoli, family = "binomial", 
                     data = train)
summary(logreg.Model4)

## 
## Call:
## glm(formula = class ~ clumpThickness + bareNuclei + blandChromatin + 
##     normalNucleoli, family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.19237  -0.07252  -0.02052   0.01121   2.82093  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -12.4622     1.9377  -6.431 1.26e-10 ***
## clumpThickness   1.0480     0.2139   4.900 9.60e-07 ***
## bareNuclei       0.5681     0.1161   4.893 9.91e-07 ***
## blandChromatin   0.7251     0.1919   3.779 0.000157 ***
## normalNucleoli   0.5186     0.1582   3.277 0.001048 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 615.215  on 476  degrees of freedom
## Residual deviance:  58.045  on 472  degrees of freedom
## AIC: 68.045
## 
## Number of Fisher Scoring iterations: 9

" AIC: 68.045 "

## [1] " AIC: 68.045 "

" big jump in AIC value "

## [1] " big jump in AIC value "

## removing normalNucleoli based on high p-value
logreg.Model5 <- glm(formula = class ~ clumpThickness + bareNuclei + 
                       blandChromatin, family = "binomial", 
                     data = train)
summary(logreg.Model5)

## 
## Call:
## glm(formula = class ~ clumpThickness + bareNuclei + blandChromatin, 
##     family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.51932  -0.10899  -0.02875   0.01659   2.79223  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -11.2846     1.4933  -7.557 4.13e-14 ***
## clumpThickness   1.0244     0.1877   5.456 4.86e-08 ***
## bareNuclei       0.6227     0.1156   5.385 7.23e-08 ***
## blandChromatin   0.8219     0.1699   4.838 1.31e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 615.215  on 476  degrees of freedom
## Residual deviance:  73.684  on 473  degrees of freedom
## AIC: 81.684
## 
## Number of Fisher Scoring iterations: 8

" AIC: 81.684 "

## [1] " AIC: 81.684 "

" there is a very big jump in AIC which isn't acceptable "

## [1] " there is a very big jump in AIC which isn't acceptable "

### test accuracy on unseen data for model 3 and 4
logreg.Model_semifinal1 <- logreg.Model3
test_pred1 <- predict(logreg.Model_semifinal1, type = "response", 
                      newdata = test)
summary(test_pred1)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000246 0.0004663 0.0046003 0.3468188 0.9897998 0.9999996

plot(test_pred1)

logreg.Model_semifinal2 <- logreg.Model4
test_pred2 <- predict(logreg.Model_semifinal2, type = "response", 
                      newdata = test)
summary(test_pred2)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000676 0.0008211 0.0066400 0.3458105 0.9790580 0.9999985

plot(test_pred2)

" test_pred2 is more distributed whereas test_pred1 is either very close to 1 or 0 "

## [1] " test_pred2 is more distributed whereas test_pred1 is either very close to 1 or 0 "

" we are choosimng logreg.Model_semifinal2 as our current model "

## [1] " we are choosimng logreg.Model_semifinal2 as our current model "

logreg.finalmodel <- logreg.Model_semifinal2
## predict using latest model
test_pred = predict(logreg.finalmodel, type = "response", 
                    newdata = test)
# Let's see the summary 
summary(test_pred)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000676 0.0008211 0.0066400 0.3458105 0.9790580 0.9999985

#  Min.    1st Qu.   Median    Mean    3rd Qu.   Max. 
#0.0000676 0.0008211 0.0066400 0.3458105 0.9790580 0.9999985 

test$prob <- test_pred
View(test)

# Let's use the probability cutoff of 50%.
test_pred_malignant <- factor(ifelse(test_pred >= 0.50, "mailgnat", "benign"))
test_actual_malignant <- factor(ifelse(test$class==1,"mailgnat","benign"))
table(test_actual_malignant,test_pred_malignant)

##                      test_pred_malignant
## test_actual_malignant benign mailgnat
##              benign      127        4
##              mailgnat      7       67

# Let's use the probability cutoff of 40%.
test_pred_malignant <- factor(ifelse(test_pred >= 0.40, "mailgnat", "benign"))
test_actual_malignant <- factor(ifelse(test$class==1,"mailgnat","benign"))
table(test_actual_malignant,test_pred_malignant)

##                      test_pred_malignant
## test_actual_malignant benign mailgnat
##              benign      127        4
##              mailgnat      6       68

# Confusion Matrix for Cut-Off Probability- 40%
test_pred_malignant <- factor(ifelse(test_pred >= 0.40, "mailgnat", "benign"))

test_conf <- confusionMatrix(test_pred_malignant, test_actual_malignant, positive = "mailgnat")
test_conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction benign mailgnat
##   benign      127        6
##   mailgnat      4       68
##                                           
##                Accuracy : 0.9512          
##                  95% CI : (0.9121, 0.9764)
##     No Information Rate : 0.639           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8936          
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.9189          
##             Specificity : 0.9695          
##          Pos Pred Value : 0.9444          
##          Neg Pred Value : 0.9549          
##              Prevalence : 0.3610          
##          Detection Rate : 0.3317          
##    Detection Prevalence : 0.3512          
##       Balanced Accuracy : 0.9442          
##                                           
##        'Positive' Class : mailgnat        
##

#### Let's choose the best cutoff value based on sensitivity and specificity ####

# Let's find out the optimal probalility cutoff 

perform_fn <- function(cutoff) {
  predicted_malignancy <- factor(ifelse(test_pred >= cutoff, "mailgnat", "benign"))
  conf <- confusionMatrix(predicted_malignancy, test_actual_malignant, positive = "mailgnat")
  acc <- conf$overall[1]
  sens <- conf$byClass[1]
  spec <- conf$byClass[2]
  out <- t(as.matrix(c(sens, spec, acc))) 
  colnames(out) <- c("sensitivity", "specificity", "accuracy")
  return(out)
}
summary(test_pred)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000676 0.0008211 0.0066400 0.3458105 0.9790580 0.9999985

# Creating cutoff values from 0.0008211 to 0.9790580 for 
# plotting and initiallizing a matrix of 100 X 3
s = seq(0.01,0.60,length=100)
OUT = matrix(0,100,3)
for(i in 1:100){
  OUT[i,] = perform_fn(s[i])
} 
## plot results
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend(0,.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))

## find cutoff point by chossing m,inimum difference between sensitivity and specificity
cutoff <- s[which(abs(OUT[,1]-OUT[,2]) == min(abs(OUT[,1]-OUT[,2])))]
cutoff

##  [1] 0.08747475 0.09343434 0.09939394 0.10535354 0.11131313 0.11727273
##  [7] 0.12323232 0.12919192 0.13515152 0.14111111

# 0.08747475 0.09343434 0.09939394 0.10535354 0.11131313 0.11727273 0.12323232 0.12919192
# 0.13515152 0.1411111
" so there are multiple cutoff values where difeerence between sensitivity and specificity 
  is minimum. We are taking highest value among them i.e. .141 as our cutoff "

## [1] " so there are multiple cutoff values where difeerence between sensitivity and specificity \n  is minimum. We are taking highest value among them i.e. .141 as our cutoff "

cutoff <- max(cutoff)

## create confusion matrix based on that
test_pred_malignant <- factor(ifelse(test_pred >= cutoff, "mailgnat", "benign"))
conf_final_logr <- confusionMatrix(test_pred_malignant, test_actual_malignant, positive = "mailgnat")

acc <- conf_final_logr$overall[1]
acc

##  Accuracy 
## 0.9609756

sens <- conf_final_logr$byClass[1]
sens

## Sensitivity 
##   0.9594595

spec <- conf_final_logr$byClass[2]
spec

## Specificity 
##   0.9618321

breastCancer.R

rajarshighoshal

Mon Mar 11 00:29:44 2019