library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.0.1 ✔ purrr 0.2.5
## ✔ tidyr 0.8.2 ✔ dplyr 0.7.8
## ✔ readr 1.3.1 ✔ stringr 1.3.1
## ✔ tibble 2.0.1 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
library(scales)
## load data and name columns properly
breastCancer <-
read.csv("~/Desktop/CUproject/breast-cancer-wisconsin.csv",
na.strings = "?")
colnames(breastCancer) <-
c(
"id",
"clumpThickness",
"uniformityCellSize",
"uniformityCellShape",
"marginalAdhesion",
"singleEpithelCellSize",
"bareNuclei",
"blandChromatin",
"normalNucleoli",
"mitoses",
"class"
)
str(breastCancer)
## 'data.frame': 698 obs. of 11 variables:
## $ id : int 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 1035283 ...
## $ clumpThickness : int 5 3 6 4 8 1 2 2 4 1 ...
## $ uniformityCellSize : int 4 1 8 1 10 1 1 1 2 1 ...
## $ uniformityCellShape : int 4 1 8 1 10 1 2 1 1 1 ...
## $ marginalAdhesion : int 5 1 1 3 8 1 1 1 1 1 ...
## $ singleEpithelCellSize: int 7 2 3 2 7 2 2 2 2 1 ...
## $ bareNuclei : int 10 2 4 1 10 10 1 1 1 1 ...
## $ blandChromatin : int 3 3 3 3 9 3 3 1 2 3 ...
## $ normalNucleoli : int 2 1 7 1 7 1 1 1 1 1 ...
## $ mitoses : int 1 1 1 1 1 1 1 5 1 1 ...
## $ class : int 2 2 2 2 4 2 2 2 2 2 ...
## remove id
breastCancer$id <- NULL
## see summary
summary(breastCancer)
## clumpThickness uniformityCellSize uniformityCellShape marginalAdhesion
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.000
## Mean : 4.417 Mean : 3.138 Mean : 3.211 Mean : 2.809
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## singleEpithelCellSize bareNuclei blandChromatin normalNucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.00
## Mean : 3.218 Mean : 3.548 Mean : 3.438 Mean : 2.87
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 4.00
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.00
## NA's :16
## mitoses class
## Min. : 1.00 Min. :2.000
## 1st Qu.: 1.00 1st Qu.:2.000
## Median : 1.00 Median :2.000
## Mean : 1.59 Mean :2.691
## 3rd Qu.: 1.00 3rd Qu.:4.000
## Max. :10.00 Max. :4.000
##
" only bare nuclei has 16 NAs "
## [1] " only bare nuclei has 16 NAs "
########## EDA #########
breastCancer$class <- as.factor(breastCancer$class)
levels(breastCancer$class)
## [1] "2" "4"
levels(breastCancer$class) <- c("benign", "malignant")
### number of benign and malignant tumors
total <- nrow(breastCancer) # 698
benign <- length(which(breastCancer$class == "benign")) # 457
malignant <- length(which(breastCancer$class == "malignant")) # 241
benign / total * 100
## [1] 65.47278
malignant / total * 100
## [1] 34.52722
" So, out of 689 rows, 457 i.e. 65.47% are benign and 241 i.e. 34.53% is malignant "
## [1] " So, out of 689 rows, 457 i.e. 65.47% are benign and 241 i.e. 34.53% is malignant "
" though the data is little off-balancced it is workable "
## [1] " though the data is little off-balancced it is workable "
##### univariate analysis #####
univariate <- function(dt, variable, var_name) {
ggplot(dt, aes(fct_infreq(variable))) +
geom_bar(aes(y = (..count..) / sum(..count..)), fill = "coral") +
geom_text(aes(y = ((..count..) / sum(..count..)), label = scales::percent((..count..) /
sum(..count..))),
stat = "count",
vjust = -0.15) +
scale_y_continuous(labels = percent) +
labs(title = var_name, x = var_name, y = "Percent") +
theme(
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text()
)
}
### class
univariate(breastCancer, breastCancer$class, "Rate of Malignant")

" 34.5% malignant, 65.5% benign "
## [1] " 34.5% malignant, 65.5% benign "
### clump thikness
univariate(breastCancer,
as.factor(breastCancer$clumpThickness),
"Clump Thickness")

" highest 1 and lowest 9"
## [1] " highest 1 and lowest 9"
### Uniformity of Cell Size
univariate(breastCancer,
as.factor(breastCancer$uniformityCellSize),
"Uniformity of Cell Size ")

" 1 has very high value (54.9%) compared to others, so it could be treated as outlier "
## [1] " 1 has very high value (54.9%) compared to others, so it could be treated as outlier "
### Uniformity of Cell Shape
univariate(
breastCancer,
as.factor(breastCancer$uniformityCellShape),
"Uniformity of Cell Shape "
)

" 1 has very high value (50.4%)n compared to others, could possibly be outlier "
## [1] " 1 has very high value (50.4%)n compared to others, could possibly be outlier "
### Marginal Adhesion
univariate(breastCancer,
as.factor(breastCancer$marginalAdhesion),
"Marginal Adhesion")

" 1 has most common value (58.2%) "
## [1] " 1 has most common value (58.2%) "
### Single Epithelial Cell Size
univariate(
breastCancer,
as.factor(breastCancer$singleEpithelCellSize),
"Single Epithelial Cell Size"
)

" 2 has highest value (55.2%) "
## [1] " 2 has highest value (55.2%) "
### Bare Nuclei
univariate(breastCancer,
as.factor(breastCancer$bareNuclei),
"Bare Nuclei")

" 1 has highest value(57.4%), 2.3% NA"
## [1] " 1 has highest value(57.4%), 2.3% NA"
### Bland Chromatin
univariate(breastCancer,
as.factor(breastCancer$blandChromatin),
"Bland Chromatin")

" 2,3,1 has more or less same values, more evenly distributed than other variables "
## [1] " 2,3,1 has more or less same values, more evenly distributed than other variables "
### Normal Nucleoli
univariate(breastCancer,
as.factor(breastCancer$normalNucleoli),
"Normal Nucleoli")

" 1 has 63.3% values "
## [1] " 1 has 63.3% values "
### Mitoses
univariate(breastCancer, as.factor(breastCancer$mitoses), "Mitoses")

" 1 has 82.8% values "
## [1] " 1 has 82.8% values "
#### box plot of the variables ####
### wrt clump thikness
ggplot(breastCancer, aes(x = class, y = clumpThickness)) + geom_boxplot(aes(fill = class))

" there is a clear relation between higher clump thickness and malignancy "
## [1] " there is a clear relation between higher clump thickness and malignancy "
### wrt Uniformity of Cell Size
ggplot(breastCancer, aes(x = class, y = uniformityCellSize)) + geom_boxplot(aes(fill = class))

" except some outliers, higher uniformioty of cell size is linked with higher chane of malignancy "
## [1] " except some outliers, higher uniformioty of cell size is linked with higher chane of malignancy "
### wrt Uniformity of Cell Shape
ggplot(breastCancer, aes(x = class, y = uniformityCellShape)) + geom_boxplot(aes(fill = class))

" uniformity of cell shape has very identical result with uniformity cell size "
## [1] " uniformity of cell shape has very identical result with uniformity cell size "
cor(breastCancer$uniformityCellSize,
breastCancer$uniformityCellShape)
## [1] 0.9068137
" .91 correlation; highly correlated "
## [1] " .91 correlation; highly correlated "
### wrt Marginal Adhesion
ggplot(breastCancer, aes(x = class, y = marginalAdhesion)) + geom_boxplot(aes(fill = class))

" higher marginal adhesion is linked with higher malignancy "
## [1] " higher marginal adhesion is linked with higher malignancy "
### wrt Single Epithelial Cell Size
ggplot(breastCancer, aes(x = class, y = singleEpithelCellSize)) + geom_boxplot(aes(fill = class))

" Here too with higher single epithelial cell size chances of mal;ignancy increases "
## [1] " Here too with higher single epithelial cell size chances of mal;ignancy increases "
### wrt Bare Nuclei
ggplot(breastCancer, aes(x = class, y = bareNuclei)) + geom_boxplot(aes(fill = class))
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).

" there is a very high correlation between bare nuclei and malignancy "
## [1] " there is a very high correlation between bare nuclei and malignancy "
### wrt Bland Chromatin
ggplot(breastCancer, aes(x = class, y = blandChromatin)) + geom_boxplot(aes(fill = class))

" higher bland chromatin correlates to higher malignancy "
## [1] " higher bland chromatin correlates to higher malignancy "
### wrt Normal Nucleoli
ggplot(breastCancer, aes(x = class, y = normalNucleoli)) + geom_boxplot(aes(fill = class))

" higer value correlates to higher chance of malignancy "
## [1] " higer value correlates to higher chance of malignancy "
### wrt Mitoses
ggplot(breastCancer, aes(x = class, y = mitoses)) + geom_boxplot(aes(fill = class))

" here there isn't much significant difference; butr malignants tend to have more tendency of mitoses"
## [1] " here there isn't much significant difference; butr malignants tend to have more tendency of mitoses"
##### bivariate analysis #####
### correlation plot
M <- cor(breastCancer[-ncol(breastCancer)] %>% drop_na())
col1 <-
colorRampPalette(
c(
"#7F0000",
"red",
"#FF7F00",
"yellow",
"white",
"cyan",
"#007FFF",
"blue",
"#00007F"
)
)
library(corrplot)
## corrplot 0.84 loaded
corrplot(
M,
method = "color",
cl.lim = c(-1, 1),
col = col1(100),
addgrid.col = "black"
)

" all of the columns have positive correlation "
## [1] " all of the columns have positive correlation "
## uniformity cell shape vs uniformity cell size
ggplot(breastCancer) + geom_boxplot(aes(x = uniformityCellSize, y = uniformityCellShape, fill = class)) +
geom_line(
data = (
breastCancer %>% group_by(uniformityCellSize) %>%
summarize(avg = mean(uniformityCellShape))
),
aes(x = uniformityCellSize, y = avg, group = 1)
)

" Though there is mostly linear trend, there are some fluctuation, so it it better to keep both of them "
## [1] " Though there is mostly linear trend, there are some fluctuation, so it it better to keep both of them "
### scatter plot
## uniformity of cell size and shape
ggplot(breastCancer) + geom_point(aes(x = uniformityCellSize, y = uniformityCellShape, col = class))

" data points are linearly seperable "
## [1] " data points are linearly seperable "
##### Principal Component Analysis #####
summary(breastCancer[-ncol(breastCancer)] %>% drop_na)
## clumpThickness uniformityCellSize uniformityCellShape marginalAdhesion
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.000
## Mean : 4.441 Mean : 3.154 Mean : 3.218 Mean : 2.833
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## singleEpithelCellSize bareNuclei blandChromatin normalNucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.000
## Mean : 3.236 Mean : 3.548 Mean : 3.446 Mean : 2.872
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## mitoses
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 1.604
## 3rd Qu.: 1.000
## Max. :10.000
M
## clumpThickness uniformityCellSize
## clumpThickness 1.0000000 0.6429362
## uniformityCellSize 0.6429362 1.0000000
## uniformityCellShape 0.6539679 0.9071584
## marginalAdhesion 0.4881746 0.7067860
## singleEpithelCellSize 0.5238909 0.7534148
## bareNuclei 0.5935238 0.6914868
## blandChromatin 0.5538245 0.7556635
## normalNucleoli 0.5344063 0.7191730
## mitoses 0.3510996 0.4606035
## uniformityCellShape marginalAdhesion
## clumpThickness 0.6539679 0.4881746
## uniformityCellSize 0.9071584 0.7067860
## uniformityCellShape 1.0000000 0.6857348
## marginalAdhesion 0.6857348 1.0000000
## singleEpithelCellSize 0.7223130 0.5943395
## bareNuclei 0.7136609 0.6704341
## blandChromatin 0.7354603 0.6686132
## normalNucleoli 0.7177840 0.6028932
## mitoses 0.4410959 0.4187345
## singleEpithelCellSize bareNuclei blandChromatin
## clumpThickness 0.5238909 0.5935238 0.5538245
## uniformityCellSize 0.7534148 0.6914868 0.7556635
## uniformityCellShape 0.7223130 0.7136609 0.7354603
## marginalAdhesion 0.5943395 0.6704341 0.6686132
## singleEpithelCellSize 1.0000000 0.5854889 0.6181347
## bareNuclei 0.5854889 1.0000000 0.6806888
## blandChromatin 0.6181347 0.6806888 1.0000000
## normalNucleoli 0.6287425 0.5840221 0.6656376
## mitoses 0.4804510 0.3390050 0.3459572
## normalNucleoli mitoses
## clumpThickness 0.5344063 0.3510996
## uniformityCellSize 0.7191730 0.4606035
## uniformityCellShape 0.7177840 0.4410959
## marginalAdhesion 0.6028932 0.4187345
## singleEpithelCellSize 0.6287425 0.4804510
## bareNuclei 0.5840221 0.3390050
## blandChromatin 0.6656376 0.3459572
## normalNucleoli 1.0000000 0.4336022
## mitoses 0.4336022 1.0000000
breastCancer.PCA <-
princomp(breastCancer[-ncol(breastCancer)] %>% drop_na,
cor = T,
scores = T)
summary(breastCancer.PCA)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 2.4288437 0.88096641 0.73407879 0.67815051
## Proportion of Variance 0.6554757 0.08623354 0.05987463 0.05109868
## Cumulative Proportion 0.6554757 0.74170928 0.80158391 0.85268259
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.61678824 0.54958062 0.54251544 0.51050669
## Proportion of Variance 0.04226975 0.03355987 0.03270256 0.02895745
## Cumulative Proportion 0.89495233 0.92851221 0.96121476 0.99017222
## Comp.9
## Standard deviation 0.297405528
## Proportion of Variance 0.009827783
## Cumulative Proportion 1.000000000
## check loadings
loadings(breastCancer.PCA)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## clumpThickness 0.302 0.140 0.866 0.108 0.243
## uniformityCellSize 0.381 -0.204 -0.146 0.140 0.208
## uniformityCellShape 0.378 -0.176 -0.108 0.131
## marginalAdhesion 0.333 -0.413 0.493 0.654 -0.127
## singleEpithelCellSize 0.336 -0.164 -0.427 -0.637 -0.215
## bareNuclei 0.335 0.261 0.499 -0.124 -0.610 -0.401
## blandChromatin 0.346 0.228 -0.214 0.226 -0.298 0.698
## normalNucleoli 0.336 -0.134 -0.417 0.690 -0.460
## mitoses 0.230 -0.906 0.259 0.105 -0.148 0.133
## Comp.8 Comp.9
## clumpThickness 0.249
## uniformityCellSize -0.435 0.733
## uniformityCellShape -0.582 -0.667
## marginalAdhesion 0.162
## singleEpithelCellSize 0.456
## bareNuclei -0.130
## blandChromatin 0.389
## normalNucleoli
## mitoses
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111
## Cumulative Var 0.111 0.222 0.333 0.444 0.556 0.667 0.778 0.889
## Comp.9
## SS loadings 1.000
## Proportion Var 0.111
## Cumulative Var 1.000
## scree plot
screeplot(breastCancer.PCA, type = 'lines', main = 'Scree Plot')

ggbiplot(breastCancer.PCA)

##### Model Building and Evaluation #####
#### Logistic Regression ####
## convert benign to 0 and mailgnat to 1 in class column
breastCancerNum <- na.omit(breastCancer)
levels(breastCancerNum$class) <- c(0,1)
breastCancerNum$class <- as.numeric(levels(breastCancerNum$class))[breastCancerNum$class]
## split data into test train
set.seed(100)
trainindices <- sample(1:nrow(breastCancerNum), 0.7*nrow(breastCancerNum))
train <- breastCancerNum[trainindices,]
test <- breastCancerNum[-trainindices,]
## necessary libraries
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## create first model using all the variables in the dataset
logreg.Model1 <- glm(class ~ ., data = train, family = 'binomial')
summary(logreg.Model1)
##
## Call:
## glm(formula = class ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.28627 -0.05472 -0.02005 0.00301 2.14122
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.46156 3.26806 -4.425 9.64e-06 ***
## clumpThickness 0.87477 0.27001 3.240 0.00120 **
## uniformityCellSize -0.11343 0.36445 -0.311 0.75562
## uniformityCellShape 0.28370 0.41230 0.688 0.49140
## marginalAdhesion 0.37257 0.18136 2.054 0.03995 *
## singleEpithelCellSize 0.04411 0.22465 0.196 0.84434
## bareNuclei 0.44147 0.14172 3.115 0.00184 **
## blandChromatin 0.65945 0.24692 2.671 0.00757 **
## normalNucleoli 0.46480 0.18952 2.453 0.01419 *
## mitoses 1.60276 1.95074 0.822 0.41129
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 615.215 on 476 degrees of freedom
## Residual deviance: 46.239 on 467 degrees of freedom
## AIC: 66.239
##
## Number of Fisher Scoring iterations: 11
" AIC: 66.239 "
## [1] " AIC: 66.239 "
## use stepAIC for stepwise selection
logreg.Model2 <- stepAIC(logreg.Model1, direction = 'both')
## Start: AIC=66.24
## class ~ clumpThickness + uniformityCellSize + uniformityCellShape +
## marginalAdhesion + singleEpithelCellSize + bareNuclei + blandChromatin +
## normalNucleoli + mitoses
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - singleEpithelCellSize 1 46.276 64.276
## - uniformityCellSize 1 46.336 64.336
## - uniformityCellShape 1 46.728 64.728
## <none> 46.239 66.239
## - mitoses 1 49.227 67.227
## - marginalAdhesion 1 50.296 68.296
## - normalNucleoli 1 54.181 72.181
## - blandChromatin 1 54.722 72.722
## - bareNuclei 1 58.460 76.460
## - clumpThickness 1 62.185 80.185
##
## Step: AIC=64.28
## class ~ clumpThickness + uniformityCellSize + uniformityCellShape +
## marginalAdhesion + bareNuclei + blandChromatin + normalNucleoli +
## mitoses
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - uniformityCellSize 1 46.374 62.374
## - uniformityCellShape 1 46.789 62.789
## <none> 46.276 64.276
## - mitoses 1 49.303 65.303
## + singleEpithelCellSize 1 46.239 66.239
## - marginalAdhesion 1 50.350 66.350
## - normalNucleoli 1 54.648 70.648
## - blandChromatin 1 54.954 70.954
## - bareNuclei 1 59.252 75.252
## - clumpThickness 1 62.230 78.230
##
## Step: AIC=62.37
## class ~ clumpThickness + uniformityCellShape + marginalAdhesion +
## bareNuclei + blandChromatin + normalNucleoli + mitoses
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - uniformityCellShape 1 47.076 61.076
## <none> 46.374 62.374
## - mitoses 1 49.459 63.459
## + uniformityCellSize 1 46.276 64.276
## + singleEpithelCellSize 1 46.336 64.336
## - marginalAdhesion 1 50.397 64.397
## - blandChromatin 1 55.105 69.105
## - normalNucleoli 1 55.266 69.266
## - bareNuclei 1 60.622 74.622
## - clumpThickness 1 62.320 76.320
##
## Step: AIC=61.08
## class ~ clumpThickness + marginalAdhesion + bareNuclei + blandChromatin +
## normalNucleoli + mitoses
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 47.076 61.076
## + uniformityCellShape 1 46.374 62.374
## + uniformityCellSize 1 46.789 62.789
## - mitoses 1 50.869 62.869
## + singleEpithelCellSize 1 46.988 62.988
## - marginalAdhesion 1 53.233 65.233
## - normalNucleoli 1 59.650 71.650
## - blandChromatin 1 62.165 74.165
## - bareNuclei 1 70.996 82.996
## - clumpThickness 1 75.352 87.352
summary(logreg.Model2)
##
## Call:
## glm(formula = class ~ clumpThickness + marginalAdhesion + bareNuclei +
## blandChromatin + normalNucleoli + mitoses, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.28081 -0.05283 -0.01808 0.00190 2.41599
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.7827 3.1020 -4.766 1.88e-06 ***
## clumpThickness 0.9595 0.2509 3.824 0.000131 ***
## marginalAdhesion 0.4036 0.1657 2.435 0.014875 *
## bareNuclei 0.5100 0.1223 4.171 3.04e-05 ***
## blandChromatin 0.7337 0.2194 3.344 0.000826 ***
## normalNucleoli 0.5272 0.1743 3.024 0.002491 **
## mitoses 1.4627 1.7391 0.841 0.400322
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 615.215 on 476 degrees of freedom
## Residual deviance: 47.076 on 470 degrees of freedom
## AIC: 61.076
##
## Number of Fisher Scoring iterations: 11
" AIC: 61.076 "
## [1] " AIC: 61.076 "
sort(vif(logreg.Model2))
## mitoses bareNuclei blandChromatin normalNucleoli
## 1.056305 1.081025 1.136740 1.166521
## marginalAdhesion clumpThickness
## 1.189952 1.345116
" all the vif are very low so we can check importance of the variables solely based on p-values "
## [1] " all the vif are very low so we can check importance of the variables solely based on p-values "
## removing mitoses based on high p-value
logreg.Model3 <- glm(formula = class ~ clumpThickness + marginalAdhesion + bareNuclei +
blandChromatin + normalNucleoli, family = "binomial",
data = train)
summary(logreg.Model3)
##
## Call:
## glm(formula = class ~ clumpThickness + marginalAdhesion + bareNuclei +
## blandChromatin + normalNucleoli, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.41708 -0.05025 -0.01409 0.00489 2.31797
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.8935 2.4668 -5.632 1.78e-08 ***
## clumpThickness 1.1343 0.2527 4.489 7.15e-06 ***
## marginalAdhesion 0.4128 0.1632 2.529 0.011434 *
## bareNuclei 0.5042 0.1216 4.146 3.39e-05 ***
## blandChromatin 0.6969 0.2069 3.368 0.000756 ***
## normalNucleoli 0.5342 0.1687 3.167 0.001541 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 615.215 on 476 degrees of freedom
## Residual deviance: 50.869 on 471 degrees of freedom
## AIC: 62.869
##
## Number of Fisher Scoring iterations: 9
" AIC: 62.869 "
## [1] " AIC: 62.869 "
## removing marginalAdhesion based on p-values
logreg.Model4 <- glm(formula = class ~ clumpThickness + bareNuclei +
blandChromatin + normalNucleoli, family = "binomial",
data = train)
summary(logreg.Model4)
##
## Call:
## glm(formula = class ~ clumpThickness + bareNuclei + blandChromatin +
## normalNucleoli, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.19237 -0.07252 -0.02052 0.01121 2.82093
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.4622 1.9377 -6.431 1.26e-10 ***
## clumpThickness 1.0480 0.2139 4.900 9.60e-07 ***
## bareNuclei 0.5681 0.1161 4.893 9.91e-07 ***
## blandChromatin 0.7251 0.1919 3.779 0.000157 ***
## normalNucleoli 0.5186 0.1582 3.277 0.001048 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 615.215 on 476 degrees of freedom
## Residual deviance: 58.045 on 472 degrees of freedom
## AIC: 68.045
##
## Number of Fisher Scoring iterations: 9
" AIC: 68.045 "
## [1] " AIC: 68.045 "
" big jump in AIC value "
## [1] " big jump in AIC value "
## removing normalNucleoli based on high p-value
logreg.Model5 <- glm(formula = class ~ clumpThickness + bareNuclei +
blandChromatin, family = "binomial",
data = train)
summary(logreg.Model5)
##
## Call:
## glm(formula = class ~ clumpThickness + bareNuclei + blandChromatin,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.51932 -0.10899 -0.02875 0.01659 2.79223
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.2846 1.4933 -7.557 4.13e-14 ***
## clumpThickness 1.0244 0.1877 5.456 4.86e-08 ***
## bareNuclei 0.6227 0.1156 5.385 7.23e-08 ***
## blandChromatin 0.8219 0.1699 4.838 1.31e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 615.215 on 476 degrees of freedom
## Residual deviance: 73.684 on 473 degrees of freedom
## AIC: 81.684
##
## Number of Fisher Scoring iterations: 8
" AIC: 81.684 "
## [1] " AIC: 81.684 "
" there is a very big jump in AIC which isn't acceptable "
## [1] " there is a very big jump in AIC which isn't acceptable "
### test accuracy on unseen data for model 3 and 4
logreg.Model_semifinal1 <- logreg.Model3
test_pred1 <- predict(logreg.Model_semifinal1, type = "response",
newdata = test)
summary(test_pred1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000246 0.0004663 0.0046003 0.3468188 0.9897998 0.9999996
plot(test_pred1)

logreg.Model_semifinal2 <- logreg.Model4
test_pred2 <- predict(logreg.Model_semifinal2, type = "response",
newdata = test)
summary(test_pred2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000676 0.0008211 0.0066400 0.3458105 0.9790580 0.9999985
plot(test_pred2)

" test_pred2 is more distributed whereas test_pred1 is either very close to 1 or 0 "
## [1] " test_pred2 is more distributed whereas test_pred1 is either very close to 1 or 0 "
" we are choosimng logreg.Model_semifinal2 as our current model "
## [1] " we are choosimng logreg.Model_semifinal2 as our current model "
logreg.finalmodel <- logreg.Model_semifinal2
## predict using latest model
test_pred = predict(logreg.finalmodel, type = "response",
newdata = test)
# Let's see the summary
summary(test_pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000676 0.0008211 0.0066400 0.3458105 0.9790580 0.9999985
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#0.0000676 0.0008211 0.0066400 0.3458105 0.9790580 0.9999985
test$prob <- test_pred
View(test)
# Let's use the probability cutoff of 50%.
test_pred_malignant <- factor(ifelse(test_pred >= 0.50, "mailgnat", "benign"))
test_actual_malignant <- factor(ifelse(test$class==1,"mailgnat","benign"))
table(test_actual_malignant,test_pred_malignant)
## test_pred_malignant
## test_actual_malignant benign mailgnat
## benign 127 4
## mailgnat 7 67
# Let's use the probability cutoff of 40%.
test_pred_malignant <- factor(ifelse(test_pred >= 0.40, "mailgnat", "benign"))
test_actual_malignant <- factor(ifelse(test$class==1,"mailgnat","benign"))
table(test_actual_malignant,test_pred_malignant)
## test_pred_malignant
## test_actual_malignant benign mailgnat
## benign 127 4
## mailgnat 6 68
# Confusion Matrix for Cut-Off Probability- 40%
test_pred_malignant <- factor(ifelse(test_pred >= 0.40, "mailgnat", "benign"))
test_conf <- confusionMatrix(test_pred_malignant, test_actual_malignant, positive = "mailgnat")
test_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign mailgnat
## benign 127 6
## mailgnat 4 68
##
## Accuracy : 0.9512
## 95% CI : (0.9121, 0.9764)
## No Information Rate : 0.639
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8936
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.9189
## Specificity : 0.9695
## Pos Pred Value : 0.9444
## Neg Pred Value : 0.9549
## Prevalence : 0.3610
## Detection Rate : 0.3317
## Detection Prevalence : 0.3512
## Balanced Accuracy : 0.9442
##
## 'Positive' Class : mailgnat
##
#### Let's choose the best cutoff value based on sensitivity and specificity ####
# Let's find out the optimal probalility cutoff
perform_fn <- function(cutoff) {
predicted_malignancy <- factor(ifelse(test_pred >= cutoff, "mailgnat", "benign"))
conf <- confusionMatrix(predicted_malignancy, test_actual_malignant, positive = "mailgnat")
acc <- conf$overall[1]
sens <- conf$byClass[1]
spec <- conf$byClass[2]
out <- t(as.matrix(c(sens, spec, acc)))
colnames(out) <- c("sensitivity", "specificity", "accuracy")
return(out)
}
summary(test_pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000676 0.0008211 0.0066400 0.3458105 0.9790580 0.9999985
# Creating cutoff values from 0.0008211 to 0.9790580 for
# plotting and initiallizing a matrix of 100 X 3
s = seq(0.01,0.60,length=100)
OUT = matrix(0,100,3)
for(i in 1:100){
OUT[i,] = perform_fn(s[i])
}
## plot results
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend(0,.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))

## find cutoff point by chossing m,inimum difference between sensitivity and specificity
cutoff <- s[which(abs(OUT[,1]-OUT[,2]) == min(abs(OUT[,1]-OUT[,2])))]
cutoff
## [1] 0.08747475 0.09343434 0.09939394 0.10535354 0.11131313 0.11727273
## [7] 0.12323232 0.12919192 0.13515152 0.14111111
# 0.08747475 0.09343434 0.09939394 0.10535354 0.11131313 0.11727273 0.12323232 0.12919192
# 0.13515152 0.1411111
" so there are multiple cutoff values where difeerence between sensitivity and specificity
is minimum. We are taking highest value among them i.e. .141 as our cutoff "
## [1] " so there are multiple cutoff values where difeerence between sensitivity and specificity \n is minimum. We are taking highest value among them i.e. .141 as our cutoff "
cutoff <- max(cutoff)
## create confusion matrix based on that
test_pred_malignant <- factor(ifelse(test_pred >= cutoff, "mailgnat", "benign"))
conf_final_logr <- confusionMatrix(test_pred_malignant, test_actual_malignant, positive = "mailgnat")
acc <- conf_final_logr$overall[1]
acc
## Accuracy
## 0.9609756
sens <- conf_final_logr$byClass[1]
sens
## Sensitivity
## 0.9594595
spec <- conf_final_logr$byClass[2]
spec
## Specificity
## 0.9618321