library(mlbench)
data(BreastCancer)
dim(BreastCancer)
## [1] 699 11
summary(BreastCancer)
## Id Cl.thickness Cell.size Cell.shape
## Length:699 1 :145 1 :384 1 :353
## Class :character 5 :130 10 : 67 2 : 59
## Mode :character 3 :108 3 : 52 10 : 58
## 4 : 80 2 : 45 3 : 56
## 10 : 69 4 : 40 4 : 44
## 2 : 50 5 : 30 5 : 34
## (Other):117 (Other): 81 (Other): 95
## Marg.adhesion Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli
## 1 :407 2 :386 1 :402 2 :166 1 :443
## 2 : 58 3 : 72 10 :132 3 :165 10 : 61
## 3 : 58 4 : 48 2 : 30 1 :152 3 : 44
## 10 : 55 1 : 47 5 : 30 7 : 73 2 : 36
## 4 : 33 6 : 41 3 : 28 4 : 40 8 : 24
## 8 : 25 5 : 39 (Other): 61 5 : 34 6 : 22
## (Other): 63 (Other): 66 NA's : 16 (Other): 69 (Other): 69
## Mitoses Class
## 1 :579 benign :458
## 2 : 35 malignant:241
## 3 : 33
## 10 : 14
## 4 : 12
## 7 : 9
## (Other): 17
str(BreastCancer)
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
levels(BreastCancer$Class)
## [1] "benign" "malignant"
head(BreastCancer)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 4 4 3 7 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
BreastCancer$Cl.thickness <- as.numeric(BreastCancer$Cl.thickness)
BreastCancer$Cell.size <- as.numeric(BreastCancer$Cell.size)
BreastCancer$Cell.shape <- as.numeric(BreastCancer$Cell.shape)
BreastCancer$Marg.adhesion <- as.numeric(BreastCancer$Marg.adhesion)
BreastCancer$Epith.c.size <- as.numeric(BreastCancer$Epith.c.size)
BreastCancer$Bare.nuclei <- as.numeric(BreastCancer$Bare.nuclei)
BreastCancer$Bl.cromatin <- as.numeric(BreastCancer$Bl.cromatin)
BreastCancer$Normal.nucleoli <- as.numeric(BreastCancer$Normal.nucleoli)
BreastCancer$Mitoses <- as.numeric(BreastCancer$Mitoses)
BreastCancer$Id <- NULL
BreastCancer = na.omit(BreastCancer)
str(BreastCancer)
## 'data.frame': 683 obs. of 10 variables:
## $ Cl.thickness : num 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : num 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : num 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : num 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : num 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : num 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : num 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: num 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
library(reshape2)
library(ggplot2)
breastcancer.m = melt(BreastCancer, id.var="Class")
ggplot(data=breastcancer.m, aes(x=Class, y=value)) + geom_boxplot() +facet_wrap(~variable,ncol = 3)

library(corrplot)
bc = cor(BreastCancer[ ,1:9]) #create an object of the features
corrplot.mixed(bc)

set.seed(123) #random number generator
ind = sample(2, nrow(BreastCancer), replace=TRUE, prob=c(0.6, 0.4))
train = BreastCancer[ind==1,]
test = BreastCancer[ind==2,]
str(train)
## 'data.frame': 411 obs. of 10 variables:
## $ Cl.thickness : num 5 3 8 1 2 4 2 1 8 4 ...
## $ Cell.size : num 1 1 10 1 1 2 1 1 7 1 ...
## $ Cell.shape : num 1 1 10 1 1 1 1 1 5 1 ...
## $ Marg.adhesion : num 1 1 8 1 1 1 1 1 10 1 ...
## $ Epith.c.size : num 2 2 7 2 2 2 2 2 7 2 ...
## $ Bare.nuclei : num 1 2 10 10 1 1 1 3 9 1 ...
## $ Bl.cromatin : num 3 3 9 3 1 2 2 3 5 2 ...
## $ Normal.nucleoli: num 1 1 7 1 1 1 1 1 5 1 ...
## $ Mitoses : num 1 1 1 1 5 1 1 1 4 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 2 1 1 1 1 1 2 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
str(test)
## 'data.frame': 272 obs. of 10 variables:
## $ Cl.thickness : num 5 6 4 2 1 5 7 6 7 10 ...
## $ Cell.size : num 4 8 1 1 1 3 4 1 3 5 ...
## $ Cell.shape : num 4 8 1 2 1 3 6 1 2 5 ...
## $ Marg.adhesion : num 5 1 3 1 1 3 4 1 10 3 ...
## $ Epith.c.size : num 7 3 2 2 1 2 6 2 5 6 ...
## $ Bare.nuclei : num 10 4 1 1 1 3 1 1 10 7 ...
## $ Bl.cromatin : num 3 3 3 3 3 4 4 3 5 7 ...
## $ Normal.nucleoli: num 2 7 1 1 1 4 3 1 4 10 ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 4 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 2 1 2 2 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
table(train$Class)
##
## benign malignant
## 262 149
table(test$Class)
##
## benign malignant
## 182 90
full.fit = glm(Class~., family=binomial, data=train)
summary(full.fit)
##
## Call:
## glm(formula = Class ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.14730 -0.13192 -0.07123 0.03253 2.54277
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.5627 1.3275 -7.203 5.87e-13 ***
## Cl.thickness 0.4517 0.1536 2.940 0.00328 **
## Cell.size -0.1718 0.2477 -0.694 0.48799
## Cell.shape 0.3913 0.2530 1.547 0.12192
## Marg.adhesion 0.2863 0.1710 1.674 0.09415 .
## Epith.c.size 0.4360 0.2243 1.944 0.05192 .
## Bare.nuclei 0.3875 0.1246 3.110 0.00187 **
## Bl.cromatin 0.1930 0.2152 0.897 0.36979
## Normal.nucleoli 0.2212 0.1562 1.416 0.15685
## Mitoses 0.5041 0.3147 1.602 0.10915
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 538.295 on 410 degrees of freedom
## Residual deviance: 67.545 on 401 degrees of freedom
## AIC: 87.545
##
## Number of Fisher Scoring iterations: 8
confint(full.fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -12.66144554 -7.3331661
## Cl.thickness 0.16601480 0.7803406
## Cell.size -0.64590918 0.3524918
## Cell.shape -0.11974907 0.9028702
## Marg.adhesion -0.03686037 0.6564497
## Epith.c.size 0.00719916 0.8771961
## Bare.nuclei 0.15129030 0.6466227
## Bl.cromatin -0.21840836 0.6432532
## Normal.nucleoli -0.08317520 0.5470076
## Mitoses -0.06250576 1.0649630
exp(coef(full.fit))
## (Intercept) Cl.thickness Cell.size Cell.shape
## 7.030617e-05 1.570954e+00 8.421386e-01 1.478874e+00
## Marg.adhesion Epith.c.size Bare.nuclei Bl.cromatin
## 1.331490e+00 1.546542e+00 1.473311e+00 1.212936e+00
## Normal.nucleoli Mitoses
## 1.247516e+00 1.655534e+00
library(car)
vif(full.fit)
## Cl.thickness Cell.size Cell.shape Marg.adhesion
## 1.219143 3.151518 2.324207 1.303363
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli
## 1.467540 1.435779 1.583648 1.486191
## Mitoses
## 1.061130
train$probs = predict(full.fit, type="response")
train$probs[1:5]
## [1] 0.014280145 0.008574463 0.999904164 0.072183452 0.018722223
contrasts(train$Class)
## malignant
## benign 0
## malignant 1
train$predict = rep("benign", 411)
train$predict[train$probs>0.5]="malignant"
table(train$predict, train$Class)
##
## benign malignant
## benign 255 5
## malignant 7 144
mean(train$predict==train$Class)
## [1] 0.9708029
test$prob = predict(full.fit, newdata=test, type="response")
test$predict = rep("benign", 272)
test$predict[test$prob>0.5]="malignant"
table(test$predict, test$Class)
##
## benign malignant
## benign 177 6
## malignant 5 84
mean(test$predict==test$Class)
## [1] 0.9595588
library(bestglm)
## Loading required package: leaps
train$y=rep(0,411)
train$y[train$Class=="malignant"]=1
head(train[ ,13])
## [1] 0 0 1 0 0 0
breastcancer.cv = train[ ,-10:-12]
head(breastcancer.cv)
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 5 1 1 1 2
## 3 3 1 1 1 2
## 6 8 10 10 8 7
## 7 1 1 1 1 2
## 9 2 1 1 1 2
## 10 4 2 1 1 2
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses y
## 1 1 3 1 1 0
## 3 2 3 1 1 0
## 6 10 9 7 1 1
## 7 10 3 1 1 0
## 9 1 1 1 5 0
## 10 1 2 1 1 0
bestglm(Xy = breastcancer.cv, IC="CV", CVArgs=list(Method="HTF", K=10,
REP=1), family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## CV(K = 10, REP = 1)
## BICq equivalent for q in (0.000598490115906847, 0.147977113736258)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.1567508 1.07067064 -8.552351 1.206067e-17
## Cl.thickness 0.6919913 0.13051837 5.301869 1.146232e-07
## Epith.c.size 0.8028333 0.16009340 5.014781 5.309404e-07
## Bare.nuclei 0.6097159 0.09390343 6.493010 8.413826e-11
reduce.fit = glm(Class~Cl.thickness+Epith.c.size+Bare.nuclei, family=binomial, data=train)
train$cv.probs = predict(reduce.fit, type="response")
train$cv.predict = rep("benign", 411)
train$cv.predict[train$cv.probs>0.5]="malignant"
table(train$cv.predict, train$Class)
##
## benign malignant
## benign 254 6
## malignant 8 143
test$cv.probs = predict(reduce.fit, newdata=test, type="response")
test$predict = rep("benign", 272)
test$predict[test$cv.probs>0.5]="malignant"
table(test$predict, test$Class)
##
## benign malignant
## benign 178 11
## malignant 4 79
bestglm(Xy= breastcancer.cv, IC="BIC", family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## BIC
## BICq equivalent for q in (0.147977113736258, 0.76937387821992)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.7047252 1.0700450 -8.134915 4.122275e-16
## Cl.thickness 0.5001156 0.1364326 3.665662 2.466995e-04
## Cell.shape 0.4879225 0.1882773 2.591510 9.555573e-03
## Epith.c.size 0.6034520 0.1766281 3.416512 6.342890e-04
## Bare.nuclei 0.4801927 0.1039397 4.619918 3.838921e-06
bic.fit=glm(Class~Cl.thickness+Cell.shape+Epith.c.size+Bare.nuclei, family=binomial, data=train)
test$bic.probs = predict(bic.fit, newdata=test, type="response")
test$bic.predict = rep("benign", 272)
test$bic.predict[test$bic.probs>0.5]="malignant"
table(test$bic.predict, test$Class)
##
## benign malignant
## benign 178 9
## malignant 4 81