##importing the data set
library(mlbench)
data(BreastCancer)
  1. Frequency histogram for Cell.size
##creating a histogram for Cell.size
BreastCancer$Cell.size <- as.numeric(BreastCancer$Cell.size) #converting the ordered factor to numeric (the level codes 1-10 match the values)
hist(BreastCancer$Cell.size)

Conclusion: Based on the histogram, most cells have a size below 2; the distribution is strongly right-skewed.
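
As an optional refinement (a sketch assuming ggplot2, which this analysis loads later anyway), the same histogram can be drawn with an explicit bin width and labeled axes:

##ggplot2 version of the histogram (illustrative sketch)
library(ggplot2)
ggplot(BreastCancer, aes(x=Cell.size)) +
  geom_histogram(binwidth=1, fill="steelblue", color="white") +
  labs(x="Cell.size (1-10)", y="Frequency", title="Distribution of Cell.size")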

  2. Scatter plot for Cell.size and Normal.nucleoli
##massaging the data: inspecting the unique values of each column first
unique(BreastCancer$Id)
unique(BreastCancer$Cl.thickness)
unique(BreastCancer$Cell.size)
unique(BreastCancer$Cell.shape)
unique(BreastCancer$Marg.adhesion)
unique(BreastCancer$Epith.c.size)
unique(BreastCancer$Bare.nuclei)
unique(BreastCancer$Bl.cromatin)
unique(BreastCancer$Normal.nucleoli)
unique(BreastCancer$Mitoses)
unique(BreastCancer$Class)
##converting the factor columns (and the character Id) to numeric
BreastCancer$Id <- as.numeric(BreastCancer$Id)
BreastCancer$Cl.thickness <- as.numeric(BreastCancer$Cl.thickness)
BreastCancer$Cell.shape <- as.numeric(BreastCancer$Cell.shape)
BreastCancer$Cell.size <- as.numeric(BreastCancer$Cell.size)
BreastCancer$Marg.adhesion <- as.numeric(BreastCancer$Marg.adhesion)
BreastCancer$Epith.c.size <- as.numeric(BreastCancer$Epith.c.size)
BreastCancer$Bare.nuclei <- as.numeric(BreastCancer$Bare.nuclei)
BreastCancer$Bl.cromatin <- as.numeric(BreastCancer$Bl.cromatin)
BreastCancer$Normal.nucleoli <- as.numeric(BreastCancer$Normal.nucleoli)
BreastCancer$Mitoses <- as.numeric(BreastCancer$Mitoses)
BreastCancer$binary <- as.numeric(BreastCancer$Class)-1 #benign -> 0, malignant -> 1


##removing the rows where Bare.nuclei is missing (NA)
BreastCancer <- BreastCancer[!is.na(BreastCancer$Bare.nuclei),]
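
As a quick sanity check (our addition, not part of the original analysis), we can confirm the filter removed all missing values:

sum(is.na(BreastCancer$Bare.nuclei)) #should now be 0
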
##creating a scatter plot for Cell.size and Normal.nucleoli
library(ggplot2)
ggplot(data=BreastCancer, aes(x=Cell.size, y=Normal.nucleoli, color=Class)) + 
  geom_jitter() 

Conclusion: Based on the scatter plot, as Cell.size and Normal.nucleoli increase, the cells are increasingly malignant; cells with low values on both variables are mostly benign.
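
To back the visual impression with numbers, a quick cross-tabulation (an illustrative sketch; the cutoff of 5 is arbitrary) gives the malignancy rate for small versus large cell sizes:

##proportion of malignant cells below vs. at-or-above a Cell.size of 5
tapply(BreastCancer$binary, BreastCancer$Cell.size >= 5, mean)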

  3. Calculating the Gini index for Cl.thickness and Cell.shape
##calculating the gini index for Cl.thickness, splitting at 5
##subsetting rows 22:33 as a small worked example
thickness <- BreastCancer[c(22:33),c("binary", "Cl.thickness")]

thickness$class_thick <- as.numeric(thickness[,2]>=5) #1 if Cl.thickness >= 5
thickness$class_thin <- as.numeric(thickness[,2]<5)
thickness$malignant_thick <- as.numeric(thickness$binary==1 & thickness$class_thick==1) #malignant cases in the thick node
thickness$malignant_thin <- as.numeric(thickness$binary==1 & thickness$class_thin==1) #malignant cases in the thin node

##creating a data frame
gini <- list()
gini$malignant <- c(sum(thickness$malignant_thin),sum(thickness$malignant_thick))
gini$total <- c(sum(thickness$class_thin), sum(thickness$class_thick))
gini$benign <- gini$total - gini$malignant
gini <- as.data.frame(gini)
rownames(gini) <- c("less5", "greater5") #row 1 is the thin node, row 2 the thick node

#calculating the gini index
gini$each <- 1-(gini$malignant/gini$total)^2-(gini$benign/gini$total)^2
gini_index <- sum(gini$each*gini$total/sum(gini$total)) #weighted gini index for the split (~0.13)

Repeating the same calculation for Cell.shape, splitting at 4:

##calculating the gini index for Cell.shape, splitting at 4
shape <- BreastCancer[c(22:33),c("binary", "Cell.shape")]
shape$big <- as.numeric(shape[,2]>=4) #1 if Cell.shape >= 4
shape$small <- as.numeric(shape[,2]<4)
shape$malignant_big <- as.numeric(shape$binary==1 & shape$big==1) #malignant cases in the big node
shape$malignant_small <- as.numeric(shape$binary==1 & shape$small==1) #malignant cases in the small node

##creating a data frame
gini2 <- list()
gini2$malignant <- c(sum(shape$malignant_small), sum(shape$malignant_big))
gini2$total <- c(sum(shape$small), sum(shape$big))
gini2$benign <- gini2$total - gini2$malignant
gini2 <- as.data.frame(gini2)
rownames(gini2) <- c("less4", "greater4") #row 1 is the small node, row 2 the big node

#calculating the gini index
gini2$each <- 1-(gini2$malignant/gini2$total)^2-(gini2$benign/gini2$total)^2
gini2_index <- sum(gini2$each*gini2$total/sum(gini2$total)) #weighted gini index for the split (~0.15)

Conclusion: The Gini indices for Cl.thickness and Cell.shape are 0.13 and 0.15 respectively. Since a lower Gini index means a purer split, Cl.thickness has the greater splitting power and would be chosen first in a decision tree.
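
The hard-coded steps above can be wrapped in a reusable helper so the feature and cutoff become parameters. This is a minimal sketch (gini_split is our own illustrative function, not from any package, and it assumes both nodes are non-empty); it reproduces the arithmetic used above:

##weighted gini index of a binary label for a split at `cutoff` (illustrative sketch)
gini_split <- function(feature, label, cutoff) {
  node <- feature >= cutoff
  weighted <- sapply(c(FALSE, TRUE), function(side) {
    p <- mean(label[node == side])              #proportion malignant in this node
    impurity <- 1 - p^2 - (1 - p)^2             #gini impurity of the node
    impurity * sum(node == side) / length(node) #weight by node size
  })
  sum(weighted)
}
gini_split(thickness$Cl.thickness, thickness$binary, 5) #~0.13
gini_split(shape$Cell.shape, shape$binary, 4)           #~0.15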

  4. Building a logistic regression model
##moving into predictive statistics (machine learning): an 80/20 train/test split
##note: without set.seed(), the split (and the numbers below) will vary between runs
index <- sample(1:nrow(BreastCancer), size=0.8*nrow(BreastCancer))
BreastCancer_train <- BreastCancer[index,]
BreastCancer_test <- BreastCancer[-index,]
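
As a quick check (our addition) that the random split preserves the class balance:

prop.table(table(BreastCancer_train$Class))
prop.table(table(BreastCancer_test$Class))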
##logistic regression modeling 
my_logit <- glm(binary~Cl.thickness+Cell.size+Cell.shape+Marg.adhesion+Epith.c.size+Bare.nuclei+Bl.cromatin+Normal.nucleoli+Mitoses,
                data=BreastCancer_train, family="binomial")
summary(my_logit)
## 
## Call:
## glm(formula = binary ~ Cl.thickness + Cell.size + Cell.shape + 
##     Marg.adhesion + Epith.c.size + Bare.nuclei + Bl.cromatin + 
##     Normal.nucleoli + Mitoses, family = "binomial", data = BreastCancer_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4641  -0.1183  -0.0612   0.0317   2.3657  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -9.87332    1.24335  -7.941 2.01e-15 ***
## Cl.thickness     0.54640    0.15295   3.572 0.000354 ***
## Cell.size       -0.05356    0.20626  -0.260 0.795124    
## Cell.shape       0.29504    0.22600   1.305 0.191730    
## Marg.adhesion    0.42973    0.14599   2.944 0.003244 ** 
## Epith.c.size     0.11071    0.16322   0.678 0.497601    
## Bare.nuclei      0.36856    0.09776   3.770 0.000163 ***
## Bl.cromatin      0.42736    0.17538   2.437 0.014817 *  
## Normal.nucleoli  0.19068    0.11410   1.671 0.094681 .  
## Mitoses          0.42266    0.37546   1.126 0.260292    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 719.56  on 545  degrees of freedom
## Residual deviance:  90.84  on 536  degrees of freedom
## AIC: 110.84
## 
## Number of Fisher Scoring iterations: 8

Conclusion: The most significant predictors based on their p-values are clump thickness (Cl.thickness), marginal adhesion (Marg.adhesion), bare nuclei (Bare.nuclei), and bland chromatin (Bl.cromatin); mitoses is not significant (p ≈ 0.26).
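
A natural follow-up (sketched here, not part of the original analysis) is to refit the model with only the significant predictors and compare the AIC against the full model's 110.84:

##reduced model with only the significant predictors (illustrative sketch)
my_logit_reduced <- glm(binary~Cl.thickness+Marg.adhesion+Bare.nuclei+Bl.cromatin,
                        data=BreastCancer_train, family="binomial")
summary(my_logit_reduced) #a similar or lower AIC would favor the simpler model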

##how good is my model?
library(caret)
## Loading required package: lattice
my_prediction_testing <- predict(my_logit, BreastCancer_test, type = "response")

confusionMatrix(data=as.factor(as.numeric(my_prediction_testing>0.5)),
                reference = as.factor(as.numeric(BreastCancer_test$binary)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 99  2
##          1  1 35
##                                           
##                Accuracy : 0.9781          
##                  95% CI : (0.9373, 0.9955)
##     No Information Rate : 0.7299          
##     P-Value [Acc > NIR] : 4.194e-15       
##                                           
##                   Kappa : 0.944           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9900          
##             Specificity : 0.9459          
##          Pos Pred Value : 0.9802          
##          Neg Pred Value : 0.9722          
##              Prevalence : 0.7299          
##          Detection Rate : 0.7226          
##    Detection Prevalence : 0.7372          
##       Balanced Accuracy : 0.9680          
##                                           
##        'Positive' Class : 0               
## 
##building a confusion matrix for training 
my_prediction_training <- predict(my_logit, BreastCancer_train, type = "response")

confusionMatrix(data=as.factor(as.numeric(my_prediction_training>0.5)),
                reference = as.factor(as.numeric(BreastCancer_train$binary)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 335   8
##          1   9 194
##                                           
##                Accuracy : 0.9689          
##                  95% CI : (0.9506, 0.9818)
##     No Information Rate : 0.63            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9333          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9738          
##             Specificity : 0.9604          
##          Pos Pred Value : 0.9767          
##          Neg Pred Value : 0.9557          
##              Prevalence : 0.6300          
##          Detection Rate : 0.6136          
##    Detection Prevalence : 0.6282          
##       Balanced Accuracy : 0.9671          
##                                           
##        'Positive' Class : 0               
## 

Conclusion: The model performs well: accuracy is consistent between the training data (96.9%) and the testing data (97.8%), which suggests it is not overfitting. In both sets, true positives and true negatives far outnumber false positives and false negatives.
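
Beyond accuracy, a threshold-independent metric such as the AUC is a useful complement; a minimal sketch using the pROC package (assuming it is installed) is:

##ROC curve and AUC on the test set (assumes the pROC package is installed)
library(pROC)
roc_test <- roc(response=BreastCancer_test$binary, predictor=my_prediction_testing)
plot(roc_test)
auc(roc_test) #values near 1 indicate strong discrimination between benign and malignant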