##importing the data set
library(mlbench)
data(BreastCancer)
## Histogram of Cell.size
# Convert through as.character(): as.numeric() applied directly to a factor
# returns the internal level codes rather than the labelled scores.
# as.character() is a no-op on the values if the column is already numeric.
BreastCancer$Cell.size <- as.numeric(as.character(BreastCancer$Cell.size))
hist(
  BreastCancer$Cell.size,
  main = "Distribution of Cell.size",
  xlab = "Cell.size"
)
# Conclusion: Based on the histogram, most cells have a size score of 2 or below.
## Inspecting the distinct values of every column before cleaning
# One call covers all columns of BreastCancer (including Cell.shape, which is
# converted further down) and returns a list with one element per column, so
# each set of distinct values is labelled with its column name.
lapply(BreastCancer, unique)
## Converting the Id and the factor-coded score columns to numeric
BreastCancer$Id <- as.numeric(BreastCancer$Id)

# as.numeric() on a factor returns the internal level codes, not the labels.
# The two only coincide when the levels are exactly "1", "2", ..., "k"; a
# factor with a missing level would be silently renumbered. Going through
# as.character() always recovers the labelled score, and it is harmless for
# a column that has already been converted to numeric.
score_cols <- c(
  "Cl.thickness", "Cell.shape", "Cell.size", "Marg.adhesion", "Epith.c.size",
  "Bare.nuclei", "Bl.cromatin", "Normal.nucleoli", "Mitoses"
)
BreastCancer[score_cols] <- lapply(
  BreastCancer[score_cols],
  function(score) as.numeric(as.character(score))
)

# 0/1 outcome used by the Gini calculations and the logistic regression:
# 1 = malignant, 0 = benign.
BreastCancer$binary <- as.numeric(BreastCancer$Class == "malignant")
# Keep only the observations that have a recorded Bare.nuclei value; rows
# where it is NA are removed from the data frame.
rows_with_bare_nuclei <- which(!is.na(BreastCancer$Bare.nuclei))
BreastCancer <- BreastCancer[rows_with_bare_nuclei, ]
## Jittered scatter plot of Cell.size against Normal.nucleoli, coloured by Class
library(ggplot2)
# Jitter is used because both axes hold discrete scores, so points would
# otherwise be drawn on top of each other.
ggplot(BreastCancer) +
  geom_jitter(aes(x = Cell.size, y = Normal.nucleoli, colour = Class))
# Conclusion: Based on the scatter plot, as cell size and normal nucleoli increase, more of the cells are malignant. Conversely, as cell size and normal nucleoli decrease, more of the cells are benign.
## Gini index for splitting on Cl.thickness at a threshold of 5
## Uses rows 22 to 33 (by position) of the current BreastCancer data frame.
thickness <- BreastCancer[22:33, c("binary", "Cl.thickness")]

# Branch membership indicators: thick = score >= 5, thin = score < 5.
thickness$class_thick <- as.numeric(thickness$Cl.thickness >= 5)
thickness$class_thin <- as.numeric(thickness$Cl.thickness < 5)

# Despite the column names, BOTH indicators count malignant cases (binary == 1):
# `cancerous` = malignant in the thick branch,
# `noncancerous` = malignant in the thin branch.
thickness$cancerous <- as.numeric(
  thickness$binary == 1 & thickness$class_thick == 1
)
thickness$noncancerous <- as.numeric(
  thickness$binary == 1 & thickness$class_thin == 1
)

## Per-branch counts. Row 1 is the thin branch (< 5) and row 2 the thick
## branch (>= 5), so the row names must follow that order.
gini <- data.frame(
  malignant = c(sum(thickness$noncancerous), sum(thickness$cancerous)),
  total = c(sum(thickness$class_thin), sum(thickness$class_thick)),
  row.names = c("less5", "greater5")
)
gini$benign <- gini$total - gini$malignant

# Gini impurity of each branch: 1 - p_malignant^2 - p_benign^2.
# An empty branch would divide by zero; it contributes no impurity, so use 0.
gini$each <- ifelse(
  gini$total > 0,
  1 - (gini$malignant / gini$total)^2 - (gini$benign / gini$total)^2,
  0
)

# Weight each branch by its share of the observations and add the two terms
# to obtain the single Gini index of the split.
gini$weighted <- gini$each * gini$total / sum(gini$total)
gini_index <- sum(gini$weighted)
# The same Gini calculation is repeated below for cell shape.
## Gini index for splitting on Cell.shape at a threshold of 4
## Uses rows 22 to 33 (by position) of the current BreastCancer data frame.
shape <- BreastCancer[22:33, c("binary", "Cell.shape")]

# Branch membership indicators: big = score >= 4, small = score < 4.
shape$big <- as.numeric(shape$Cell.shape >= 4)
shape$small <- as.numeric(shape$Cell.shape < 4)

# Despite the column names, BOTH indicators count malignant cases (binary == 1):
# `cancerous` = malignant in the big branch,
# `noncancerous` = malignant in the small branch.
shape$cancerous <- as.numeric(shape$binary == 1 & shape$big == 1)
shape$noncancerous <- as.numeric(shape$binary == 1 & shape$small == 1)

## Per-branch counts. Row 1 is the small branch (< 4) and row 2 the big
## branch (>= 4), so the row names must follow that order.
gini2 <- data.frame(
  malignant = c(sum(shape$noncancerous), sum(shape$cancerous)),
  total = c(sum(shape$small), sum(shape$big)),
  row.names = c("less4", "greater4")
)
gini2$benign <- gini2$total - gini2$malignant

# Gini impurity of each branch: 1 - p_malignant^2 - p_benign^2.
# An empty branch would divide by zero; it contributes no impurity, so use 0.
gini2$each <- ifelse(
  gini2$total > 0,
  1 - (gini2$malignant / gini2$total)^2 - (gini2$benign / gini2$total)^2,
  0
)

# Weight each branch by its share of the observations and add the two terms
# to obtain the single Gini index of the split.
gini2$weighted <- gini2$each * gini2$total / sum(gini2$total)
gini2_index <- sum(gini2$weighted)
# Conclusion: The Gini indices for cell thickness and cell shape are 0.13 and 0.15, respectively. Because a lower Gini index indicates a purer split, cell thickness has the greater splitting power and would be the first split in the decision tree.
## Moving into predictive statistics (machine learning): 80/20 train/test split
# Fix the RNG seed so the split, and every result computed from it, can be
# reproduced exactly; numbers from unseeded runs will differ.
set.seed(123)
n_obs <- nrow(BreastCancer)
# seq_len() stays empty for a zero-row data frame (1:0 would yield c(1, 0)),
# and floor() makes the truncation of the fractional sample size explicit.
index <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))
BreastCancer_train <- BreastCancer[index, ]
# setdiff() rather than -index: negative indexing with an empty `index`
# would select no rows at all instead of all of them.
BreastCancer_test <- BreastCancer[setdiff(seq_len(n_obs), index), ]
## Logistic regression: model the 0/1 `binary` outcome on the training rows
## using the nine score columns as predictors (Id is left out).
my_logit <- glm(
  binary ~ Cl.thickness + Cell.size + Cell.shape + Marg.adhesion +
    Epith.c.size + Bare.nuclei + Bl.cromatin + Normal.nucleoli + Mitoses,
  family = "binomial",
  data = BreastCancer_train
)
# Coefficient table, deviances and AIC for the fitted model.
summary(my_logit)
##
## Call:
## glm(formula = binary ~ Cl.thickness + Cell.size + Cell.shape +
## Marg.adhesion + Epith.c.size + Bare.nuclei + Bl.cromatin +
## Normal.nucleoli + Mitoses, family = "binomial", data = BreastCancer_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4641 -0.1183 -0.0612 0.0317 2.3657
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.87332 1.24335 -7.941 2.01e-15 ***
## Cl.thickness 0.54640 0.15295 3.572 0.000354 ***
## Cell.size -0.05356 0.20626 -0.260 0.795124
## Cell.shape 0.29504 0.22600 1.305 0.191730
## Marg.adhesion 0.42973 0.14599 2.944 0.003244 **
## Epith.c.size 0.11071 0.16322 0.678 0.497601
## Bare.nuclei 0.36856 0.09776 3.770 0.000163 ***
## Bl.cromatin 0.42736 0.17538 2.437 0.014817 *
## Normal.nucleoli 0.19068 0.11410 1.671 0.094681 .
## Mitoses 0.42266 0.37546 1.126 0.260292
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 719.56 on 545 degrees of freedom
## Residual deviance: 90.84 on 536 degrees of freedom
## AIC: 110.84
##
## Number of Fisher Scoring iterations: 8
# Conclusion: Based on their p-values (below 0.05), the statistically significant variables are cell thickness, marginal adhesion, bare nuclei and bland chromatin; mitoses (p = 0.26) is not significant.
##how good is my model?
library(caret)
## Loading required package: lattice
# Predicted probabilities for the held-out test rows.
my_prediction_testing <- predict(
  my_logit,
  newdata = BreastCancer_test,
  type = "response"
)
# Classify at a 0.5 cut-off. Both factors are pinned to the levels 0 and 1:
# as.factor() derives levels from the values actually present, so a vector
# containing a single class would give `data` and `reference` mismatched
# levels. Level "0" stays first, so the default positive class is unchanged.
confusionMatrix(
  data = factor(as.numeric(my_prediction_testing > 0.5), levels = c(0, 1)),
  reference = factor(as.numeric(BreastCancer_test$binary), levels = c(0, 1))
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 99 2
## 1 1 35
##
## Accuracy : 0.9781
## 95% CI : (0.9373, 0.9955)
## No Information Rate : 0.7299
## P-Value [Acc > NIR] : 4.194e-15
##
## Kappa : 0.944
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9900
## Specificity : 0.9459
## Pos Pred Value : 0.9802
## Neg Pred Value : 0.9722
## Prevalence : 0.7299
## Detection Rate : 0.7226
## Detection Prevalence : 0.7372
## Balanced Accuracy : 0.9680
##
## 'Positive' Class : 0
##
## Building a confusion matrix for the training data
# Predicted probabilities for the same rows the model was fitted on; used to
# compare in-sample accuracy with the test-set accuracy.
my_prediction_training <- predict(
  my_logit,
  newdata = BreastCancer_train,
  type = "response"
)
# Classify at a 0.5 cut-off. Both factors are pinned to the levels 0 and 1:
# as.factor() derives levels from the values actually present, so a vector
# containing a single class would give `data` and `reference` mismatched
# levels. Level "0" stays first, so the default positive class is unchanged.
confusionMatrix(
  data = factor(as.numeric(my_prediction_training > 0.5), levels = c(0, 1)),
  reference = factor(as.numeric(BreastCancer_train$binary), levels = c(0, 1))
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 335 8
## 1 9 194
##
## Accuracy : 0.9689
## 95% CI : (0.9506, 0.9818)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9333
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9738
## Specificity : 0.9604
## Pos Pred Value : 0.9767
## Neg Pred Value : 0.9557
## Prevalence : 0.6300
## Detection Rate : 0.6136
## Detection Prevalence : 0.6282
## Balanced Accuracy : 0.9671
##
## 'Positive' Class : 0
##
# Conclusion: The model performs well: its accuracy is consistent between the testing and training data (about 98% and 97%, respectively). In both data sets, the numbers of true positives and true negatives are much higher than the numbers of false positives and false negatives.