# Set seed (for a reproducible split and resampling) and load packages
set.seed(12345)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(reshape)
library(caret)
library(caretEnsemble)
library(caTools)
library(data.table)
# Read data. stringsAsFactors = TRUE is passed explicitly because R >= 4.0 no
# longer converts strings to factors by default, and the code below relies on
# diagnosis being a factor (e.g. plot(df$diagnosis) and summary(df)).
df <- read.csv("cancer.csv", stringsAsFactors = TRUE)
There do not seem to be any missing values in the variables, and they are all properly identified. Two columns are of no use: one is an id column and the other contains only NAs. This data set also takes the ten main numeric characteristics and gives three different data points for each of them: mean, se, and worst. Because of this, I think it would be useful to sort the columns alphabetically, with the target variable diagnosis placed at the end.
# Show structure: the type and first few values of every column
str(df)
## 'data.frame': 569 obs. of 33 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ X : logi NA NA NA NA NA NA ...
# Remove the id column and the all-NA X column since they are not useful
df$id <- NULL
df$X <- NULL
# Sort the feature columns alphabetically and put the target at the end.
# Columns are selected by name so this keeps working if features are added
# or removed (no hard-coded column positions).
feature_cols <- sort(setdiff(names(df), "diagnosis"))
df <- df[, c(feature_cols, "diagnosis")]
Looking at the data, there are more benign than malignant cases of breast mass, so upsampling might be useful for the models. Also, the boxplots of the scaled data show that most of the attributes are skewed upwards and have outliers in the upper tail, and that area and radius have less variation, especially compared to concavity and smoothness. The plots that show the variables split by malignant and benign reveal that symmetry and fractal dimension are not very good at separating the classes, but other features such as area, concavity, and radius all separate the data quite well. Finally, plotting the correlation between the variables shows that there is in fact a high amount of correlation between some variables, which is not surprising since we have the mean, standard error, and worst case of each value, which should all be correlated since they are all measures of the same attributes. Principal Component Analysis would be a good way to reduce complexity and deal with the highly correlated variables.
# Show summary of data: five-number summary and mean for each numeric
# column, and class counts for the diagnosis factor
summary(df)
## area_mean area_se area_worst compactness_mean
## Min. : 143.5 Min. : 6.802 Min. : 185.2 Min. :0.01938
## 1st Qu.: 420.3 1st Qu.: 17.850 1st Qu.: 515.3 1st Qu.:0.06492
## Median : 551.1 Median : 24.530 Median : 686.5 Median :0.09263
## Mean : 654.9 Mean : 40.337 Mean : 880.6 Mean :0.10434
## 3rd Qu.: 782.7 3rd Qu.: 45.190 3rd Qu.:1084.0 3rd Qu.:0.13040
## Max. :2501.0 Max. :542.200 Max. :4254.0 Max. :0.34540
## compactness_se compactness_worst concave.points_mean
## Min. :0.002252 Min. :0.02729 Min. :0.00000
## 1st Qu.:0.013080 1st Qu.:0.14720 1st Qu.:0.02031
## Median :0.020450 Median :0.21190 Median :0.03350
## Mean :0.025478 Mean :0.25427 Mean :0.04892
## 3rd Qu.:0.032450 3rd Qu.:0.33910 3rd Qu.:0.07400
## Max. :0.135400 Max. :1.05800 Max. :0.20120
## concave.points_se concave.points_worst concavity_mean
## Min. :0.000000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.007638 1st Qu.:0.06493 1st Qu.:0.02956
## Median :0.010930 Median :0.09993 Median :0.06154
## Mean :0.011796 Mean :0.11461 Mean :0.08880
## 3rd Qu.:0.014710 3rd Qu.:0.16140 3rd Qu.:0.13070
## Max. :0.052790 Max. :0.29100 Max. :0.42680
## concavity_se concavity_worst fractal_dimension_mean
## Min. :0.00000 Min. :0.0000 Min. :0.04996
## 1st Qu.:0.01509 1st Qu.:0.1145 1st Qu.:0.05770
## Median :0.02589 Median :0.2267 Median :0.06154
## Mean :0.03189 Mean :0.2722 Mean :0.06280
## 3rd Qu.:0.04205 3rd Qu.:0.3829 3rd Qu.:0.06612
## Max. :0.39600 Max. :1.2520 Max. :0.09744
## fractal_dimension_se fractal_dimension_worst perimeter_mean
## Min. :0.0008948 Min. :0.05504 Min. : 43.79
## 1st Qu.:0.0022480 1st Qu.:0.07146 1st Qu.: 75.17
## Median :0.0031870 Median :0.08004 Median : 86.24
## Mean :0.0037949 Mean :0.08395 Mean : 91.97
## 3rd Qu.:0.0045580 3rd Qu.:0.09208 3rd Qu.:104.10
## Max. :0.0298400 Max. :0.20750 Max. :188.50
## perimeter_se perimeter_worst radius_mean radius_se
## Min. : 0.757 Min. : 50.41 Min. : 6.981 Min. :0.1115
## 1st Qu.: 1.606 1st Qu.: 84.11 1st Qu.:11.700 1st Qu.:0.2324
## Median : 2.287 Median : 97.66 Median :13.370 Median :0.3242
## Mean : 2.866 Mean :107.26 Mean :14.127 Mean :0.4052
## 3rd Qu.: 3.357 3rd Qu.:125.40 3rd Qu.:15.780 3rd Qu.:0.4789
## Max. :21.980 Max. :251.20 Max. :28.110 Max. :2.8730
## radius_worst smoothness_mean smoothness_se smoothness_worst
## Min. : 7.93 Min. :0.05263 Min. :0.001713 Min. :0.07117
## 1st Qu.:13.01 1st Qu.:0.08637 1st Qu.:0.005169 1st Qu.:0.11660
## Median :14.97 Median :0.09587 Median :0.006380 Median :0.13130
## Mean :16.27 Mean :0.09636 Mean :0.007041 Mean :0.13237
## 3rd Qu.:18.79 3rd Qu.:0.10530 3rd Qu.:0.008146 3rd Qu.:0.14600
## Max. :36.04 Max. :0.16340 Max. :0.031130 Max. :0.22260
## symmetry_mean symmetry_se symmetry_worst texture_mean
## Min. :0.1060 Min. :0.007882 Min. :0.1565 Min. : 9.71
## 1st Qu.:0.1619 1st Qu.:0.015160 1st Qu.:0.2504 1st Qu.:16.17
## Median :0.1792 Median :0.018730 Median :0.2822 Median :18.84
## Mean :0.1812 Mean :0.020542 Mean :0.2901 Mean :19.29
## 3rd Qu.:0.1957 3rd Qu.:0.023480 3rd Qu.:0.3179 3rd Qu.:21.80
## Max. :0.3040 Max. :0.078950 Max. :0.6638 Max. :39.28
## texture_se texture_worst diagnosis
## Min. :0.3602 Min. :12.02 B:357
## 1st Qu.:0.8339 1st Qu.:21.08 M:212
## Median :1.1080 Median :25.41
## Mean :1.2169 Mean :25.68
## 3rd Qu.:1.4740 3rd Qu.:29.72
## Max. :4.8850 Max. :49.54
# Compare distribution of target variable (plot() on a factor draws a bar
# chart of the level counts)
plot(df$diagnosis)
# Boxplot of scaled values, restricted to the ten "_mean" features
mean_features <- c(
  "area_mean", "compactness_mean", "concave.points_mean", "concavity_mean",
  "fractal_dimension_mean", "perimeter_mean", "radius_mean",
  "smoothness_mean", "symmetry_mean", "texture_mean"
)
short_labels <- c(
  "area", "compact", "concave.pts", "concavity", "fractal_dim",
  "perim", "radius", "smoothness", "symmetry", "texture"
)
df_box <- df[, mean_features]
colnames(df_box) <- short_labels
# scale() centres and scales each column; stack() reshapes to long format
# with columns `values` and `ind` (the feature label)
scaled_long <- stack(data.frame(scale(df_box)))
ggplot(scaled_long, aes(x = ind, y = values)) +
  geom_boxplot()
# Plot differences between malignant and benign: one density plot per
# "_mean" feature, filled by diagnosis. The fill aesthetic refers to the
# column by bare name (not df$diagnosis) so ggplot2 resolves it inside the
# plot data and the legend is titled "diagnosis".
g1 <- ggplot(df, aes(x = area_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g2 <- ggplot(df, aes(x = compactness_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g3 <- ggplot(df, aes(x = concave.points_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g4 <- ggplot(df, aes(x = concavity_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g5 <- ggplot(df, aes(x = fractal_dimension_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g6 <- ggplot(df, aes(x = perimeter_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g7 <- ggplot(df, aes(x = radius_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g8 <- ggplot(df, aes(x = smoothness_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g9 <- ggplot(df, aes(x = symmetry_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
g10 <- ggplot(df, aes(x = texture_mean, fill = diagnosis)) +
  geom_density(alpha = 0.25)
# Arrange the ten plots in a two-column grid
grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 2)
# Plot correlation matrix of the predictors; the target column is excluded
# by name rather than by a hard-coded position
corrplot(cor(df[, setdiff(names(df), "diagnosis")]), method = "circle")
# This makes sure that malignant is positive and benign is negative: "M" is
# made the first factor level. Relevelling before the split means train and
# test inherit the same level order.
df$diagnosis <- factor(df$diagnosis, levels = c("M", "B"))
# Split data into train set (70% of the rows, rounded down) and test set
n_obs <- nrow(df)
index <- sample(seq_len(n_obs), floor(0.7 * n_obs))
train <- df[index, ]
test <- df[-index, ]
# Compare distribution of train and test data (class proportions per set)
DistributionCompare <- cbind(
  Train = prop.table(table(train$diagnosis)),
  Test = prop.table(table(test$diagnosis))
)
# Reshape to long format with base R, giving columns Var1 (diagnosis),
# Var2 (data set) and value. This avoids melt(), which is exported by both
# reshape and data.table and so depends on package load order.
meltedDComp <- as.data.frame(
  as.table(DistributionCompare),
  responseName = "value"
)
# Values are proportions in [0, 1], so the axis is labelled accordingly
ggplot(meltedDComp, aes(x = Var1, y = value)) +
  geom_bar(aes(fill = Var2), stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Diagnosis") +
  ylab("Proportion") +
  labs(fill = "")
# Resampling setup shared by every base learner: 10-fold cross-validation
# with up-sampling, class probabilities and saved hold-out predictions
econtrol <- trainControl(
  method = "cv",
  number = 10,
  sampling = "up",
  classProbs = TRUE,
  savePredictions = TRUE,
  summaryFunction = twoClassSummary
)
# Base learners to fit, and the pre-processing applied before each of them
base_learners <- c(
  "svmPoly", "nnet", "gbm", "xgbTree", "knn", "glm", "rf", "C5.0",
  "nb", "rpart", "xgbLinear", "glmnet"
)
preprocess_steps <- c("center", "scale", "nzv", "corr", "pca")
# Fit all base learners on the training set, tuning each on ROC
models <- caretList(
  diagnosis ~ .,
  data = train,
  trControl = econtrol,
  methodList = base_learners,
  preProcess = preprocess_steps,
  metric = "ROC"
)
# Collect the resampling results of all fitted models and summarise the
# ROC, Sens and Spec distributions across the resamples
results <- resamples(models)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: svmPoly, nnet, gbm, xgbTree, knn, glm, rf, C5.0, nb, rpart, xgbLinear, glmnet
## Number of resamples: 10
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svmPoly 0.9885714 0.9925714 0.9973846 0.9958960 1.0000000 1.0000000 0
## nnet 0.9546667 0.9894286 0.9945714 0.9897626 0.9973626 1.0000000 0
## gbm 0.9786667 0.9840000 0.9885714 0.9895004 0.9958571 1.0000000 0
## xgbTree 0.9680000 0.9903077 0.9942857 0.9924982 1.0000000 1.0000000 0
## knn 0.9397436 0.9563333 0.9862381 0.9746520 0.9900000 0.9948718 0
## glm 0.9520000 0.9893333 0.9942857 0.9878212 1.0000000 1.0000000 0
## rf 0.9733333 0.9819103 0.9890476 0.9882103 0.9964286 1.0000000 0
## C5.0 0.9706667 0.9783700 0.9862857 0.9853392 0.9914286 0.9974359 0
## nb 0.9013333 0.9500000 0.9671429 0.9626549 0.9870513 0.9914286 0
## rpart 0.8426667 0.8630714 0.9086447 0.9014872 0.9311429 0.9628205 0
## xgbLinear 0.9573333 0.9824359 0.9942857 0.9890418 0.9974103 1.0000000 0
## glmnet 0.9573333 0.9851429 0.9944762 0.9882667 0.9992857 1.0000000 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svmPoly 0.8666667 0.9297619 0.9333333 0.9523810 1.0000000 1.0000000 0
## nnet 0.8571429 0.9297619 0.9333333 0.9447619 0.9833333 1.0000000 0
## gbm 0.8000000 0.8595238 0.9285714 0.9047619 0.9333333 1.0000000 0
## xgbTree 0.7333333 0.9285714 0.9333333 0.9319048 0.9833333 1.0000000 0
## knn 0.8000000 0.8821429 0.9285714 0.9047619 0.9321429 0.9333333 0
## glm 0.9285714 0.9333333 0.9666667 0.9657143 1.0000000 1.0000000 0
## rf 0.7333333 0.9285714 0.9309524 0.9119048 0.9333333 1.0000000 0
## C5.0 0.8666667 0.9285714 0.9285714 0.9180952 0.9333333 0.9333333 0
## nb 0.8000000 0.8142857 0.8571429 0.8566667 0.8666667 0.9333333 0
## rpart 0.7333333 0.8000000 0.8571429 0.8428571 0.8666667 0.9333333 0
## xgbLinear 0.8000000 0.9285714 0.9309524 0.9319048 0.9833333 1.0000000 0
## glmnet 0.9285714 0.9297619 0.9333333 0.9519048 0.9833333 1.0000000 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svmPoly 0.84 0.97 1.0000000 0.9640000 1.0000000 1.0000000 0
## nnet 0.88 0.96 0.9607692 0.9681538 1.0000000 1.0000000 0
## gbm 0.84 0.96 0.9800000 0.9640000 1.0000000 1.0000000 0
## xgbTree 0.88 0.93 0.9800000 0.9640000 1.0000000 1.0000000 0
## knn 0.88 0.93 0.9600000 0.9403077 0.9600000 0.9615385 0
## glm 0.84 0.97 1.0000000 0.9720000 1.0000000 1.0000000 0
## rf 0.88 0.96 0.9600000 0.9600000 0.9600000 1.0000000 0
## C5.0 0.88 0.92 0.9400000 0.9400000 0.9600000 1.0000000 0
## nb 0.84 0.93 0.9600000 0.9441538 0.9611538 1.0000000 0
## rpart 0.76 0.88 0.9200000 0.9003077 0.9223077 1.0000000 0
## xgbLinear 0.92 0.93 0.9600000 0.9601538 0.9903846 1.0000000 0
## glmnet 0.84 0.96 1.0000000 0.9720000 1.0000000 1.0000000 0
# Dot plot of the resampled metrics for each model
dotplot(results)
# Pairwise correlation between the models' resampled performance
mcr <- modelCor(results)
mcr
## svmPoly nnet gbm xgbTree knn
## svmPoly 1.00000000 0.35200457 0.71349395 0.39096029 0.1436214
## nnet 0.35200457 1.00000000 0.49903884 -0.02133438 0.2116555
## gbm 0.71349395 0.49903884 1.00000000 0.46211987 0.4315751
## xgbTree 0.39096029 -0.02133438 0.46211987 1.00000000 0.4260060
## knn 0.14362140 0.21165553 0.43157510 0.42600599 1.0000000
## glm 0.36547062 0.59943197 0.72512216 0.16963070 0.6945091
## rf 0.34050410 0.63011416 0.75833164 0.61036992 0.6688218
## C5.0 0.21301613 0.19573138 0.43242495 0.43003711 0.2616853
## nb 0.34072723 0.35165863 0.29505330 0.51894674 0.3980565
## rpart -0.09354103 -0.02053281 0.03058332 0.19777178 0.3599007
## xgbLinear 0.34365750 -0.22250348 0.02893271 0.62337881 0.2868449
## glmnet 0.43554349 0.59908246 0.71276428 0.27464255 0.7957497
## glm rf C5.0 nb rpart
## svmPoly 0.36547062 0.3405041 0.2130161 0.34072723 -0.09354103
## nnet 0.59943197 0.6301142 0.1957314 0.35165863 -0.02053281
## gbm 0.72512216 0.7583316 0.4324249 0.29505330 0.03058332
## xgbTree 0.16963070 0.6103699 0.4300371 0.51894674 0.19777178
## knn 0.69450913 0.6688218 0.2616853 0.39805646 0.35990067
## glm 1.00000000 0.7510162 0.2872837 0.08432336 -0.02593499
## rf 0.75101619 1.0000000 0.5832796 0.53589860 0.28063538
## C5.0 0.28728373 0.5832796 1.0000000 0.60146984 0.75383714
## nb 0.08432336 0.5358986 0.6014698 1.00000000 0.69999593
## rpart -0.02593499 0.2806354 0.7538371 0.69999593 1.00000000
## xgbLinear -0.23577041 0.1160811 0.2819137 0.75367256 0.46130759
## glmnet 0.97498429 0.7815028 0.2914645 0.23594236 0.05280618
## xgbLinear glmnet
## svmPoly 0.34365750 0.43554349
## nnet -0.22250348 0.59908246
## gbm 0.02893271 0.71276428
## xgbTree 0.62337881 0.27464255
## knn 0.28684487 0.79574971
## glm -0.23577041 0.97498429
## rf 0.11608113 0.78150278
## C5.0 0.28191368 0.29146451
## nb 0.75367256 0.23594236
## rpart 0.46130759 0.05280618
## xgbLinear 1.00000000 -0.04760364
## glmnet -0.04760364 1.00000000
# Scatterplot matrix of the resampling results across models
splom(results)
# Stack the base learners with an xgbTree meta-model, tuned on sensitivity
# over 15 bootstrap resamples
stack_control <- trainControl(
  method = "boot",
  number = 15,
  savePredictions = "final",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
stack <- caretStack(
  models,
  method = "xgbTree",
  metric = "Sens",
  verbose = FALSE,
  trControl = stack_control
)
# Class predictions of two individual base learners on the test set
test$nnet <- predict(models$nnet, test)
test$svm <- predict(models$svmPoly, test)
# Stacked-model probabilities, turned into class labels at a 0.5 cut-off.
# NOTE(review): this assumes predict(..., type = "prob") returns the
# probability of "M"; which class is reported (and whether a vector or a
# table comes back) differs between caretEnsemble versions - confirm.
pred <- predict(stack, test, type = "prob")
threshold <- 0.5
test$stack <- factor(ifelse(pred > threshold, "M", "B"), levels = c("M", "B"))
# Confusion matrix for the nnet base learner on the test set ("M" positive)
cmstackpreds <- confusionMatrix(
  data = test$nnet,
  reference = test$diagnosis,
  positive = "M",
  mode = "everything"
)
cmstackpreds
## Confusion Matrix and Statistics
##
## Reference
## Prediction M B
## M 60 7
## B 6 98
##
## Accuracy : 0.924
## 95% CI : (0.8735, 0.9589)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8401
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9091
## Specificity : 0.9333
## Pos Pred Value : 0.8955
## Neg Pred Value : 0.9423
## Precision : 0.8955
## Recall : 0.9091
## F1 : 0.9023
## Prevalence : 0.3860
## Detection Rate : 0.3509
## Detection Prevalence : 0.3918
## Balanced Accuracy : 0.9212
##
## 'Positive' Class : M
##
# Confusion matrix for the svmPoly base learner on the test set ("M" positive)
cmstackpreds <- confusionMatrix(
  data = test$svm,
  reference = test$diagnosis,
  positive = "M",
  mode = "everything"
)
cmstackpreds
## Confusion Matrix and Statistics
##
## Reference
## Prediction M B
## M 63 7
## B 3 98
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.878
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.9545
## Specificity : 0.9333
## Pos Pred Value : 0.9000
## Neg Pred Value : 0.9703
## Precision : 0.9000
## Recall : 0.9545
## F1 : 0.9265
## Prevalence : 0.3860
## Detection Rate : 0.3684
## Detection Prevalence : 0.4094
## Balanced Accuracy : 0.9439
##
## 'Positive' Class : M
##
# Confusion matrix for the stacked ensemble on the test set ("M" positive)
cmstackpreds <- confusionMatrix(
  data = test$stack,
  reference = test$diagnosis,
  positive = "M",
  mode = "everything"
)
cmstackpreds
## Confusion Matrix and Statistics
##
## Reference
## Prediction M B
## M 66 7
## B 0 98
##
## Accuracy : 0.9591
## 95% CI : (0.9175, 0.9834)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9153
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 1.0000
## Specificity : 0.9333
## Pos Pred Value : 0.9041
## Neg Pred Value : 1.0000
## Precision : 0.9041
## Recall : 1.0000
## F1 : 0.9496
## Prevalence : 0.3860
## Detection Rate : 0.3860
## Detection Prevalence : 0.4269
## Balanced Accuracy : 0.9667
##
## 'Positive' Class : M
##