load libraries/packages that are needed:
library(caret)
library("ggplot2")
library("caTools")
library("corrplot")
library(dplyr)
library(partykit)
library("e1071")
library(reshape2)
library(pROC)
library(gridExtra)
library(grid)
library(ggfortify)
library(purrr)
library(nnet)
load in data:
# Read the data set from CSV, converting string columns to factors, then
# show the structure of the resulting data frame.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
cancer <- read.csv(
  file = "~/Downloads/CancerData.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
str(cancer)
'data.frame': 569 obs. of 32 variables:
$ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
$ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
$ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
$ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
$ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
$ area_mean : num 1001 1326 1203 386 1297 ...
$ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
$ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
$ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
$ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
$ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
$ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
$ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
$ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
$ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
$ area_se : num 153.4 74.1 94 27.2 94.4 ...
$ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
$ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
$ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
$ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
$ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
$ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
$ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
$ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
$ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
$ area_worst : num 2019 1956 1709 568 1575 ...
$ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
$ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
$ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
$ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
$ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
$ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
percentage of cases with a benign vs. malignant diagnosis
# Share of each diagnosis level as a percentage of all rows in `cancer`.
# The denominator is taken from the data instead of a hard-coded row count,
# so the figures stay correct if the input file changes.
cancer %>%
  count(diagnosis) %>%
  group_by(diagnosis) %>%
  summarize(perc_dx = round((n / nrow(cancer)) * 100, 2))
split the data into training and testing sets
# Seeded 70/30 split of the rows of `cancer` into training and test sets.
set.seed(123)
m <- nrow(cancer)
# floor() makes the truncation of the fractional sample size explicit.
index <- sample(m, floor(0.7 * m))
train <- cancer[index, ]
test <- cancer[-index, ]

# Resampling settings passed to caret::train() via `trControl`:
# 5-fold cross-validation with class probabilities and the two-class summary.
fitControl <- trainControl(
  method = "cv",
  number = 5,
  preProcOptions = list(thresh = 0.99),
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
MODELS:
# Logistic regression of `diagnosis` on every other column of `train`.
# NOTE(review): `.` also pulls in the `id` column as a predictor; consider
# excluding it (e.g. `diagnosis ~ . - id`) and re-running the outputs below.
fit.glm <- glm(
  formula = diagnosis ~ .,
  family = binomial,
  data = train
)
Warning: glm.fit: algorithm did not convergeWarning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table and deviance statistics of the logistic fit.
# NOTE(review): the fit warned that it did not converge and that fitted
# probabilities of 0 or 1 occurred, so treat these estimates with caution.
summary(fit.glm)
Call:
glm(formula = diagnosis ~ ., family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.591e-04 -2.000e-08 -2.000e-08 2.000e-08 3.472e-04
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.832e+03 5.180e+05 -0.011 0.991
id -2.095e-08 1.228e-04 0.000 1.000
radius_mean -2.236e+03 1.273e+05 -0.018 0.986
texture_mean 2.641e+01 2.997e+03 0.009 0.993
perimeter_mean 3.004e+02 1.821e+04 0.016 0.987
area_mean 2.074e+00 4.989e+02 0.004 0.997
smoothness_mean 1.398e+04 1.232e+06 0.011 0.991
compactness_mean -1.668e+04 8.069e+05 -0.021 0.984
concavity_mean 3.519e+03 1.120e+06 0.003 0.997
concave.points_mean -3.693e+03 9.503e+05 -0.004 0.997
symmetry_mean -3.767e+03 6.776e+05 -0.006 0.996
fractal_dimension_mean 3.097e+04 3.739e+06 0.008 0.993
radius_se 1.845e+03 8.399e+05 0.002 0.998
texture_se 2.050e+02 3.057e+04 0.007 0.995
perimeter_se -1.177e+02 7.423e+04 -0.002 0.999
area_se -3.242e+00 3.928e+03 -0.001 0.999
smoothness_se 1.304e+04 3.644e+06 0.004 0.997
compactness_se 2.530e+04 1.874e+06 0.014 0.989
concavity_se -9.865e+03 7.101e+05 -0.014 0.989
concave.points_se 3.037e+04 3.865e+06 0.008 0.994
symmetry_se -1.051e+04 2.320e+06 -0.005 0.996
fractal_dimension_se -2.169e+05 1.734e+07 -0.013 0.990
radius_worst 6.174e+02 7.152e+04 0.009 0.993
texture_worst -4.770e+00 4.297e+03 -0.001 0.999
perimeter_worst -2.910e+01 1.216e+04 -0.002 0.998
area_worst -2.417e+00 3.463e+02 -0.007 0.994
smoothness_worst -2.426e+03 9.558e+05 -0.003 0.998
compactness_worst -1.587e+03 2.274e+05 -0.007 0.994
concavity_worst 9.678e+02 3.196e+05 0.003 0.998
concave.points_worst 5.978e+03 5.235e+05 0.011 0.991
symmetry_worst 3.358e+03 4.933e+05 0.007 0.995
fractal_dimension_worst 8.747e+03 1.827e+06 0.005 0.996
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5.1500e+02 on 397 degrees of freedom
Residual deviance: 1.4747e-06 on 366 degrees of freedom
AIC: 64
Number of Fisher Scoring iterations: 25
# Predicted probabilities on the test set from the logistic model.
prob.glm <- predict(fit.glm, newdata = test, type = "response")

# Classify with a 0.5 cut-off: FALSE -> "benign", TRUE -> "malignant".
pred.glm <- factor(
  prob.glm > 0.5,
  levels = c(FALSE, TRUE),
  labels = c("benign", "malignant")
)

# Cross-tabulate actual diagnosis (rows) against predicted class (columns).
perf.logit <- table(
  test$diagnosis,
  pred.glm,
  dnn = c("Actual", "Predicted")
)
perf.logit
Predicted
Actual benign malignant
B 94 4
M 2 71
# Print sensitivity, specificity, PPV, NPV and accuracy for the logistic model.
# NOTE(review): performance() is defined further down in this file (under
# "Evaluate Models"); that definition must be run before this call.
performance(perf.logit)
Sensitivity = 0.97
Specificity = 0.96
Positive Predictive Value = 0.95
Negative Predictive Value = 0.98
Accuracy = 0.96
# Conditional inference tree on all columns of `train` other than `diagnosis`.
fit.tree <- ctree(formula = diagnosis ~ ., data = train)
plot(fit.tree, main = "Conditional Inference Tree")

# Predict the test set and compare with the true labels ("M" is the positive class).
pred.tree <- predict(fit.tree, newdata = test)
confusionMatrix(
  data = pred.tree,
  reference = test$diagnosis,
  positive = "M"
)
Confusion Matrix and Statistics
Reference
Prediction B M
B 93 14
M 5 59
Accuracy : 0.8889
95% CI : (0.8319, 0.9318)
No Information Rate : 0.5731
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.7693
Mcnemar's Test P-Value : 0.06646
Sensitivity : 0.8082
Specificity : 0.9490
Pos Pred Value : 0.9219
Neg Pred Value : 0.8692
Prevalence : 0.4269
Detection Rate : 0.3450
Detection Prevalence : 0.3743
Balanced Accuracy : 0.8786
'Positive' Class : M
# k-nearest-neighbours fitted through caret: predictors are centred and scaled,
# 10 candidate tuning values are tried, and the model is selected on ROC using
# the resampling settings in `fitControl`.
# Note: the data frame `train` shares its name with the function train(); R
# still resolves the call to the function.
fit.knn <- train(
  diagnosis ~ .,
  data = train,
  method = "knn",
  metric = "ROC",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = fitControl
)

# Predict the test set and summarise against the true labels ("M" is positive).
pred.knn <- predict(fit.knn, newdata = test)
cm_knn <- confusionMatrix(
  data = pred.knn,
  reference = test$diagnosis,
  positive = "M"
)
cm_knn
Confusion Matrix and Statistics
Reference
Prediction B M
B 97 8
M 1 65
Accuracy : 0.9474
95% CI : (0.9024, 0.9757)
No Information Rate : 0.5731
P-Value [Acc > NIR] : <2e-16
Kappa : 0.8911
Mcnemar's Test P-Value : 0.0455
Sensitivity : 0.8904
Specificity : 0.9898
Pos Pred Value : 0.9848
Neg Pred Value : 0.9238
Prevalence : 0.4269
Detection Rate : 0.3801
Detection Prevalence : 0.3860
Balanced Accuracy : 0.9401
'Positive' Class : M
Evaluate Models
#' Print and return classification metrics for a 2 x 2 confusion table.
#'
#' @param table A 2 x 2 table or matrix of counts with the actual classes in
#'   rows and the predicted classes in columns. On both dimensions the
#'   negative class comes first and the positive class second.
#' @param n Number of decimal places used when printing each metric.
#' @return Invisibly, a named numeric vector with the unrounded metrics
#'   `sensitivity`, `specificity`, `ppv`, `npv` and `accuracy`. The rounded
#'   values are written to the console, one per line.
performance <- function(table, n = 2) {
  # Only the four cells of a 2 x 2 table are read below; reject anything else
  # instead of silently reporting numbers from the top-left corner.
  if (length(dim(table)) != 2L || any(dim(table) != 2L)) {
    stop("`table` must be a 2 x 2 confusion table", call. = FALSE)
  }

  tn <- table[1, 1]
  fp <- table[1, 2]
  fn <- table[2, 1]
  tp <- table[2, 2]

  metrics <- c(
    sensitivity = tp / (tp + fn), # recall
    specificity = tn / (tn + fp),
    ppv = tp / (tp + fp), # precision
    npv = tn / (tn + fn),
    accuracy = (tp + tn) / (tp + tn + fp + fn)
  )

  cat(paste0(
    "Sensitivity = ", round(metrics[["sensitivity"]], n),
    "\nSpecificity = ", round(metrics[["specificity"]], n),
    "\nPositive Predictive Value = ", round(metrics[["ppv"]], n),
    "\nNegative Predictive Value = ", round(metrics[["npv"]], n),
    "\nAccuracy = ", round(metrics[["accuracy"]], n), "\n"
  ))

  # Return the raw values so callers can reuse them; invisible keeps the
  # console output limited to the printed report.
  invisible(metrics)
}