library(mlbench)
data(BreastCancer)
dim(BreastCancer)
## [1] 699  11
summary(BreastCancer)
##       Id             Cl.thickness   Cell.size     Cell.shape 
##  Length:699         1      :145   1      :384   1      :353  
##  Class :character   5      :130   10     : 67   2      : 59  
##  Mode  :character   3      :108   3      : 52   10     : 58  
##                     4      : 80   2      : 45   3      : 56  
##                     10     : 69   4      : 40   4      : 44  
##                     2      : 50   5      : 30   5      : 34  
##                     (Other):117   (Other): 81   (Other): 95  
##  Marg.adhesion  Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli
##  1      :407   2      :386   1      :402   2      :166   1      :443    
##  2      : 58   3      : 72   10     :132   3      :165   10     : 61    
##  3      : 58   4      : 48   2      : 30   1      :152   3      : 44    
##  10     : 55   1      : 47   5      : 30   7      : 73   2      : 36    
##  4      : 33   6      : 41   3      : 28   4      : 40   8      : 24    
##  8      : 25   5      : 39   (Other): 61   5      : 34   6      : 22    
##  (Other): 63   (Other): 66   NA's   : 16   (Other): 69   (Other): 69    
##     Mitoses          Class    
##  1      :579   benign   :458  
##  2      : 35   malignant:241  
##  3      : 33                  
##  10     : 14                  
##  4      : 12                  
##  7      :  9                  
##  (Other): 17
str(BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
levels(BreastCancer$Class)
## [1] "benign"    "malignant"
head(BreastCancer)
##        Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025            5         1          1             1            2
## 2 1002945            5         4          4             5            7
## 3 1015425            3         1          1             1            2
## 4 1016277            6         8          8             1            3
## 5 1017023            4         1          1             3            2
## 6 1017122            8        10         10             8            7
##   Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1           1           3               1       1    benign
## 2          10           3               2       1    benign
## 3           2           3               1       1    benign
## 4           4           3               7       1    benign
## 5           1           3               1       1    benign
## 6          10           9               7       1 malignant
BreastCancer$Cl.thickness <- as.numeric(BreastCancer$Cl.thickness)
BreastCancer$Cell.size <- as.numeric(BreastCancer$Cell.size)
BreastCancer$Cell.shape <- as.numeric(BreastCancer$Cell.shape)
BreastCancer$Marg.adhesion <- as.numeric(BreastCancer$Marg.adhesion)
BreastCancer$Epith.c.size <- as.numeric(BreastCancer$Epith.c.size)
BreastCancer$Bare.nuclei <- as.numeric(BreastCancer$Bare.nuclei)
BreastCancer$Bl.cromatin <- as.numeric(BreastCancer$Bl.cromatin)
BreastCancer$Normal.nucleoli <- as.numeric(BreastCancer$Normal.nucleoli)
BreastCancer$Mitoses <- as.numeric(BreastCancer$Mitoses)

BreastCancer$Id <- NULL
BreastCancer = na.omit(BreastCancer)
str(BreastCancer)
## 'data.frame':    683 obs. of  10 variables:
##  $ Cl.thickness   : num  5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : num  1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : num  1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : num  1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : num  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : num  1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : num  3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: num  1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : num  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
library(reshape2)
library(ggplot2)
breastcancer.m = melt(BreastCancer, id.var="Class")
ggplot(data=breastcancer.m, aes(x=Class, y=value)) + geom_boxplot() +facet_wrap(~variable,ncol = 3)

library(corrplot)
bc = cor(BreastCancer[ ,1:9]) #create an object of the features
corrplot.mixed(bc)

set.seed(123) #random number generator
ind = sample(2, nrow(BreastCancer), replace=TRUE, prob=c(0.6, 0.4))
train = BreastCancer[ind==1,]
test = BreastCancer[ind==2,]
str(train)
## 'data.frame':    411 obs. of  10 variables:
##  $ Cl.thickness   : num  5 3 8 1 2 4 2 1 8 4 ...
##  $ Cell.size      : num  1 1 10 1 1 2 1 1 7 1 ...
##  $ Cell.shape     : num  1 1 10 1 1 1 1 1 5 1 ...
##  $ Marg.adhesion  : num  1 1 8 1 1 1 1 1 10 1 ...
##  $ Epith.c.size   : num  2 2 7 2 2 2 2 2 7 2 ...
##  $ Bare.nuclei    : num  1 2 10 10 1 1 1 3 9 1 ...
##  $ Bl.cromatin    : num  3 3 9 3 1 2 2 3 5 2 ...
##  $ Normal.nucleoli: num  1 1 7 1 1 1 1 1 5 1 ...
##  $ Mitoses        : num  1 1 1 1 5 1 1 1 4 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 2 1 1 1 1 1 2 1 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
str(test)
## 'data.frame':    272 obs. of  10 variables:
##  $ Cl.thickness   : num  5 6 4 2 1 5 7 6 7 10 ...
##  $ Cell.size      : num  4 8 1 1 1 3 4 1 3 5 ...
##  $ Cell.shape     : num  4 8 1 2 1 3 6 1 2 5 ...
##  $ Marg.adhesion  : num  5 1 3 1 1 3 4 1 10 3 ...
##  $ Epith.c.size   : num  7 3 2 2 1 2 6 2 5 6 ...
##  $ Bare.nuclei    : num  10 4 1 1 1 3 1 1 10 7 ...
##  $ Bl.cromatin    : num  3 3 3 3 3 4 4 3 5 7 ...
##  $ Normal.nucleoli: num  2 7 1 1 1 4 3 1 4 10 ...
##  $ Mitoses        : num  1 1 1 1 1 1 1 1 4 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 2 1 2 2 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
##   .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
table(train$Class)
## 
##    benign malignant 
##       262       149
table(test$Class)
## 
##    benign malignant 
##       182        90
full.fit = glm(Class~., family=binomial, data=train)
summary(full.fit)
## 
## Call:
## glm(formula = Class ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -3.14730  -0.13192  -0.07123   0.03253   2.54277  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -9.5627     1.3275  -7.203 5.87e-13 ***
## Cl.thickness      0.4517     0.1536   2.940  0.00328 ** 
## Cell.size        -0.1718     0.2477  -0.694  0.48799    
## Cell.shape        0.3913     0.2530   1.547  0.12192    
## Marg.adhesion     0.2863     0.1710   1.674  0.09415 .  
## Epith.c.size      0.4360     0.2243   1.944  0.05192 .  
## Bare.nuclei       0.3875     0.1246   3.110  0.00187 ** 
## Bl.cromatin       0.1930     0.2152   0.897  0.36979    
## Normal.nucleoli   0.2212     0.1562   1.416  0.15685    
## Mitoses           0.5041     0.3147   1.602  0.10915    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 538.295  on 410  degrees of freedom
## Residual deviance:  67.545  on 401  degrees of freedom
## AIC: 87.545
## 
## Number of Fisher Scoring iterations: 8
confint(full.fit)
## Waiting for profiling to be done...
##                        2.5 %     97.5 %
## (Intercept)     -12.66144554 -7.3331661
## Cl.thickness      0.16601480  0.7803406
## Cell.size        -0.64590918  0.3524918
## Cell.shape       -0.11974907  0.9028702
## Marg.adhesion    -0.03686037  0.6564497
## Epith.c.size      0.00719916  0.8771961
## Bare.nuclei       0.15129030  0.6466227
## Bl.cromatin      -0.21840836  0.6432532
## Normal.nucleoli  -0.08317520  0.5470076
## Mitoses          -0.06250576  1.0649630
exp(coef(full.fit))
##     (Intercept)    Cl.thickness       Cell.size      Cell.shape 
##    7.030617e-05    1.570954e+00    8.421386e-01    1.478874e+00 
##   Marg.adhesion    Epith.c.size     Bare.nuclei     Bl.cromatin 
##    1.331490e+00    1.546542e+00    1.473311e+00    1.212936e+00 
## Normal.nucleoli         Mitoses 
##    1.247516e+00    1.655534e+00
library(car)
vif(full.fit)
##    Cl.thickness       Cell.size      Cell.shape   Marg.adhesion 
##        1.219143        3.151518        2.324207        1.303363 
##    Epith.c.size     Bare.nuclei     Bl.cromatin Normal.nucleoli 
##        1.467540        1.435779        1.583648        1.486191 
##         Mitoses 
##        1.061130
train$probs = predict(full.fit, type="response")
train$probs[1:5]
## [1] 0.014280145 0.008574463 0.999904164 0.072183452 0.018722223
contrasts(train$Class)
##           malignant
## benign            0
## malignant         1
train$predict = rep("benign", 411)
train$predict[train$probs>0.5]="malignant"
table(train$predict, train$Class)
##            
##             benign malignant
##   benign       255         5
##   malignant      7       144
mean(train$predict==train$Class)
## [1] 0.9708029
test$prob = predict(full.fit, newdata=test, type="response")
test$predict = rep("benign", 272)
test$predict[test$prob>0.5]="malignant"
table(test$predict, test$Class)
##            
##             benign malignant
##   benign       177         6
##   malignant      5        84
mean(test$predict==test$Class)
## [1] 0.9595588
library(bestglm)
## Loading required package: leaps
train$y=rep(0,411)
train$y[train$Class=="malignant"]=1
head(train[ ,13])
## [1] 0 0 1 0 0 0
breastcancer.cv = train[ ,-10:-12]
head(breastcancer.cv)
##    Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1             5         1          1             1            2
## 3             3         1          1             1            2
## 6             8        10         10             8            7
## 7             1         1          1             1            2
## 9             2         1          1             1            2
## 10            4         2          1             1            2
##    Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses y
## 1            1           3               1       1 0
## 3            2           3               1       1 0
## 6           10           9               7       1 1
## 7           10           3               1       1 0
## 9            1           1               1       5 0
## 10           1           2               1       1 0
bestglm(Xy = breastcancer.cv, IC="CV", CVArgs=list(Method="HTF", K=10,
REP=1), family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## CV(K = 10, REP = 1)
## BICq equivalent for q in (0.000598490115906847, 0.147977113736258)
## Best Model:
##                Estimate Std. Error   z value     Pr(>|z|)
## (Intercept)  -9.1567508 1.07067064 -8.552351 1.206067e-17
## Cl.thickness  0.6919913 0.13051837  5.301869 1.146232e-07
## Epith.c.size  0.8028333 0.16009340  5.014781 5.309404e-07
## Bare.nuclei   0.6097159 0.09390343  6.493010 8.413826e-11
reduce.fit = glm(Class~Cl.thickness+Epith.c.size+Bare.nuclei, family=binomial, data=train)
train$cv.probs = predict(reduce.fit, type="response")
train$cv.predict = rep("benign", 411)
train$cv.predict[train$cv.probs>0.5]="malignant"
table(train$cv.predict, train$Class)
##            
##             benign malignant
##   benign       254         6
##   malignant      8       143
test$cv.probs = predict(reduce.fit, newdata=test, type="response")
test$predict = rep("benign", 272)
test$predict[test$cv.probs>0.5]="malignant"
table(test$predict, test$Class)
##            
##             benign malignant
##   benign       178        11
##   malignant      4        79
bestglm(Xy= breastcancer.cv, IC="BIC", family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## BIC
## BICq equivalent for q in (0.147977113736258, 0.76937387821992)
## Best Model:
##                Estimate Std. Error   z value     Pr(>|z|)
## (Intercept)  -8.7047252  1.0700450 -8.134915 4.122275e-16
## Cl.thickness  0.5001156  0.1364326  3.665662 2.466995e-04
## Cell.shape    0.4879225  0.1882773  2.591510 9.555573e-03
## Epith.c.size  0.6034520  0.1766281  3.416512 6.342890e-04
## Bare.nuclei   0.4801927  0.1039397  4.619918 3.838921e-06
bic.fit=glm(Class~Cl.thickness+Cell.shape+Epith.c.size+Bare.nuclei, family=binomial, data=train)
test$bic.probs = predict(bic.fit, newdata=test, type="response")
test$bic.predict = rep("benign", 272)
test$bic.predict[test$bic.probs>0.5]="malignant"
table(test$bic.predict, test$Class)
##            
##             benign malignant
##   benign       178         9
##   malignant      4        81