#Load libraries
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyr)
data("BreastCancer")
head(BreastCancer, n=20)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## 7 1018099 1 1 1 1 2
## 8 1018561 2 1 2 1 2
## 9 1033078 2 1 1 1 2
## 10 1033078 4 2 1 1 2
## 11 1035283 1 1 1 1 1
## 12 1036172 2 1 1 1 2
## 13 1041801 5 3 3 3 2
## 14 1043999 1 1 1 1 2
## 15 1044572 8 7 5 10 7
## 16 1047630 7 4 6 4 6
## 17 1048672 4 1 1 1 2
## 18 1049815 4 1 1 1 2
## 19 1050670 10 7 7 6 4
## 20 1050718 6 1 1 1 2
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 4 4 3 7 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
## 7 10 3 1 1 benign
## 8 1 3 1 1 benign
## 9 1 1 1 5 benign
## 10 1 2 1 1 benign
## 11 1 3 1 1 benign
## 12 1 2 1 1 benign
## 13 3 4 4 1 malignant
## 14 3 3 1 1 benign
## 15 9 5 5 4 malignant
## 16 1 4 3 1 malignant
## 17 1 2 1 1 benign
## 18 1 3 1 1 benign
## 19 10 4 1 2 malignant
## 20 1 3 1 1 benign
summary(BreastCancer)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion
## Length:699 1 :145 1 :384 1 :353 1 :407
## Class :character 5 :130 10 : 67 2 : 59 2 : 58
## Mode :character 3 :108 3 : 52 10 : 58 3 : 58
## 4 : 80 2 : 45 3 : 56 10 : 55
## 10 : 69 4 : 40 4 : 44 4 : 33
## 2 : 50 5 : 30 5 : 34 8 : 25
## (Other):117 (Other): 81 (Other): 95 (Other): 63
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## 2 :386 1 :402 2 :166 1 :443 1 :579
## 3 : 72 10 :132 3 :165 10 : 61 2 : 35
## 4 : 48 2 : 30 1 :152 3 : 44 3 : 33
## 1 : 47 5 : 30 7 : 73 2 : 36 10 : 14
## 6 : 41 3 : 28 4 : 40 8 : 24 4 : 12
## 5 : 39 (Other): 61 5 : 34 6 : 22 7 : 9
## (Other): 66 NA's : 16 (Other): 69 (Other): 69 (Other): 17
## Class
## benign :458
## malignant:241
##
##
##
##
##
dim(BreastCancer)
## [1] 699 11
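Before dropping rows, it is worth confirming where the missing values live; the summary above already indicates that all 16 NAs sit in Bare.nuclei. A minimal check in base R:
#count NAs per column, and count incomplete rows
colSums(is.na(BreastCancer))
sum(!complete.cases(BreastCancer))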
BreastCancer <- drop_na(BreastCancer)
dim(BreastCancer)
## [1] 683 11
#splitting the dataset into training and test sets
set.seed(7)
#create an index selecting a stratified 80% of the rows for training
train_index <- createDataPartition(BreastCancer$Class, p=0.80, list = FALSE)
#hold out the remaining 20% as the test set
test_data <- BreastCancer[-train_index,]
#work with the training set
dataset_train <- BreastCancer[train_index,]
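Because createDataPartition stratifies on the outcome, the class proportions should be nearly identical in the two splits. A quick sanity check (a sketch, not part of the original pipeline):
#compare class proportions across the splits
round(prop.table(table(dataset_train$Class)), 3)
round(prop.table(table(test_data$Class)), 3)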
head(dataset_train, n=20)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## 7 1018099 1 1 1 1 2
## 8 1018561 2 1 2 1 2
## 9 1033078 2 1 1 1 2
## 10 1033078 4 2 1 1 2
## 11 1035283 1 1 1 1 1
## 13 1041801 5 3 3 3 2
## 14 1043999 1 1 1 1 2
## 15 1044572 8 7 5 10 7
## 16 1047630 7 4 6 4 6
## 18 1049815 4 1 1 1 2
## 19 1050670 10 7 7 6 4
## 21 1054590 7 3 2 10 5
## 22 1054593 10 5 5 3 6
## 23 1056784 3 1 1 1 2
## 25 1059552 1 1 1 1 2
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
## 7 10 3 1 1 benign
## 8 1 3 1 1 benign
## 9 1 1 1 5 benign
## 10 1 2 1 1 benign
## 11 1 3 1 1 benign
## 13 3 4 4 1 malignant
## 14 3 3 1 1 benign
## 15 9 5 5 4 malignant
## 16 1 4 3 1 malignant
## 18 1 3 1 1 benign
## 19 10 4 1 2 malignant
## 21 10 5 4 4 malignant
## 22 7 7 10 1 malignant
## 23 1 2 1 1 benign
## 25 1 3 1 1 benign
#dimension of data
dim(dataset_train)
## [1] 548 11
#check attribute data types
sapply(dataset_train, class)
## $Id
## [1] "character"
##
## $Cl.thickness
## [1] "ordered" "factor"
##
## $Cell.size
## [1] "ordered" "factor"
##
## $Cell.shape
## [1] "ordered" "factor"
##
## $Marg.adhesion
## [1] "ordered" "factor"
##
## $Epith.c.size
## [1] "ordered" "factor"
##
## $Bare.nuclei
## [1] "factor"
##
## $Bl.cromatin
## [1] "factor"
##
## $Normal.nucleoli
## [1] "factor"
##
## $Mitoses
## [1] "factor"
##
## $Class
## [1] "factor"
#the Id column is an identifier, not a predictor; remove it
dataset_train <- dataset_train[,-1]
#convert the factor attributes to their numeric 1-10 scores
for (i in 1:9) {
dataset_train[,i] <- as.numeric(as.character(dataset_train[,i]))
}
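As an aside, the loop above can be written without indices. A hedged, loop-free equivalent using lapply, which preserves the data frame structure while converting each factor column to its underlying score:
#loop-free equivalent of the conversion above
dataset_train[, 1:9] <- lapply(dataset_train[, 1:9], function(x) as.numeric(as.character(x)))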
dim(test_data)
## [1] 135 11
head(test_data, 5)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 4 1016277 6 8 8 1 3
## 12 1036172 2 1 1 1 2
## 17 1048672 4 1 1 1 2
## 20 1050718 6 1 1 1 2
## 29 1067444 2 1 1 1 2
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 4 4 3 7 1 benign
## 12 1 2 1 1 benign
## 17 1 2 1 1 benign
## 20 1 3 1 1 benign
## 29 1 2 1 1 benign
#the Id column of test_data is likewise an identifier; remove it
test_data <- test_data[,-1]
#convert the attributes of the test_data to numeric
for (i in 1:9) {
test_data[,i] <- as.numeric(as.character(test_data[,i]))
}
dim(test_data)
## [1] 135 10
#summary of the training dataset
summary(dataset_train)
## Cl.thickness Cell.size Cell.shape Marg.adhesion
## Min. : 1.00 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.00 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 4.00 Median : 1.000 Median : 1.000 Median : 1.000
## Mean : 4.54 Mean : 3.162 Mean : 3.241 Mean : 2.823
## 3rd Qu.: 6.00 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.00 Max. :10.000 Max. :10.000 Max. :10.000
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.00 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.00 1st Qu.: 1.000
## Median : 2.000 Median : 1.000 Median : 3.00 Median : 1.000
## Mean : 3.272 Mean : 3.555 Mean : 3.44 Mean : 2.834
## 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.00 3rd Qu.: 3.000
## Max. :10.000 Max. :10.000 Max. :10.00 Max. :10.000
## Mitoses Class
## Min. : 1.000 benign :356
## 1st Qu.: 1.000 malignant:192
## Median : 1.000
## Mean : 1.617
## 3rd Qu.: 1.000
## Max. :10.000
#create a frequency distribution table on the categorical attribute
cbind(freq=table(dataset_train$Class), percentage=prop.table(table(dataset_train$Class))*100)
## freq percentage
## benign 356 64.9635
## malignant 192 35.0365
The distribution suggests the dataset is imbalanced: roughly 65% of the training cases are benign and 35% are malignant.
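One hedged option for addressing the imbalance is caret's downSample(), which equalizes the classes by randomly discarding majority-class rows. It is shown here as a sketch only and is not applied in the rest of the analysis (balanced_train is an illustrative name):
#sketch: balance the classes by down-sampling the benign majority
set.seed(7)
balanced_train <- downSample(x = dataset_train[, 1:9], y = dataset_train$Class, yname = "Class")
table(balanced_train$Class)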
#summarize the correlations between the predictors (rows with NAs were already dropped, so complete.cases is only a safeguard)
Comp_cases <- complete.cases(dataset_train)
cor(dataset_train[Comp_cases,1:9])
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## Cl.thickness 1.0000000 0.6522113 0.6710172 0.4962448 0.5297073
## Cell.size 0.6522113 1.0000000 0.9111170 0.7035362 0.7618625
## Cell.shape 0.6710172 0.9111170 1.0000000 0.6789790 0.7227183
## Marg.adhesion 0.4962448 0.7035362 0.6789790 1.0000000 0.5981889
## Epith.c.size 0.5297073 0.7618625 0.7227183 0.5981889 1.0000000
## Bare.nuclei 0.5972304 0.6990317 0.7206004 0.6647458 0.6001904
## Bl.cromatin 0.5705749 0.7574462 0.7315276 0.6584887 0.6189146
## Normal.nucleoli 0.5520499 0.7037695 0.7203971 0.5973399 0.6272372
## Mitoses 0.3590308 0.4687319 0.4395876 0.4187559 0.4529561
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## Cl.thickness 0.5972304 0.5705749 0.5520499 0.3590308
## Cell.size 0.6990317 0.7574462 0.7037695 0.4687319
## Cell.shape 0.7206004 0.7315276 0.7203971 0.4395876
## Marg.adhesion 0.6647458 0.6584887 0.5973399 0.4187559
## Epith.c.size 0.6001904 0.6189146 0.6272372 0.4529561
## Bare.nuclei 1.0000000 0.6839590 0.5918035 0.3469319
## Bl.cromatin 0.6839590 1.0000000 0.6655715 0.3568605
## Normal.nucleoli 0.5918035 0.6655715 1.0000000 0.4379907
## Mitoses 0.3469319 0.3568605 0.4379907 1.0000000
In summary, the correlation matrix suggests that features like Cell.size, Cell.shape, and Bl.cromatin are strongly interrelated (Cell.size and Cell.shape correlate at 0.91), whereas Mitoses (cell division) has by far the weakest associations with the other characteristics (all below 0.47).
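Given that 0.91 correlation, one could ask whether any predictors are redundant. caret's findCorrelation() flags candidates for removal above a chosen cutoff; this is a sketch only (the cutoff is illustrative), and all nine predictors are kept below:
#sketch: flag highly correlated predictors
high_cor <- findCorrelation(cor(dataset_train[Comp_cases, 1:9]), cutoff = 0.90)
names(dataset_train)[high_cor]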
#create histograms of the attributes
par(mfrow=c(3,3))
for(i in 1:9) {
hist(dataset_train[Comp_cases,i], main=names(dataset_train)[i])
}
# density plot for each attribute
par(mfrow=c(3,3))
for(i in 1:9) {
plot(density(dataset_train[Comp_cases,i]), main=names(dataset_train)[i])
}
#The density plots show a mix of bimodal (two-peaked) and exponential-looking distributions.
#boxplots for each attribute
par(mfrow=c(3,3))
for(i in 1:9) {
boxplot(dataset_train[,i], main = names(dataset_train[i]))
}
#scatterplot matrix (jittered, since the scores are discrete)
jittered_x <- sapply(dataset_train[,1:9], jitter)
pairs(jittered_x, labels = names(dataset_train)[1:9], col = dataset_train$Class)
#since the attributes are discrete, create a barplot of each one split by Class
par(mfrow=c(3,3))
for(i in 1:9) {
barplot(table(dataset_train$Class,dataset_train[,i]), main=names(dataset_train)[i],
legend.text=unique(dataset_train$Class))
}
#Evaluating the Model
#create a 10-fold cross-validation harness with 3 repeats
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
metric <- "Accuracy"
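Because the classes are imbalanced, accuracy alone can flatter a model. A hedged alternative setup optimizes ROC AUC instead via caret's twoClassSummary; it is sketched here (rocControl is an illustrative name) but not used below, where Accuracy is kept:
#sketch: optimize ROC AUC instead of Accuracy
rocControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                           classProbs = TRUE, summaryFunction = twoClassSummary)
#e.g. train(Class~., data=dataset_train, method="glm", metric="ROC", trControl=rocControl)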
#Evaluating the model with both linear and non-linear algorithms
# Logistic Regression (LR)
set.seed(7)
fit.glm <- train(Class~., data=dataset_train, method="glm", metric=metric, trControl=trainControl)
# Linear Discriminant Analysis (LDA)
set.seed(7)
fit.lda <- train(Class~., data=dataset_train, method="lda", metric=metric, trControl=trainControl)
# Regularized Logistic Regression (GLMNET)
set.seed(7)
fit.glmnet <- train(Class~., data=dataset_train, method="glmnet", metric=metric,
trControl=trainControl)
# K-Nearest Neighbour (KNN)
set.seed(7)
fit.knn <- train(Class~., data=dataset_train, method="knn", metric=metric, trControl=trainControl)
# Classification and Regression Tree (CART)
set.seed(7)
fit.cart <- train(Class~., data=dataset_train, method="rpart", metric=metric,
trControl=trainControl)
# SVM
set.seed(7)
fit.svm <- train(Class~., data=dataset_train, method="svmRadial", metric=metric,
trControl=trainControl)
# Compare algorithms
results <- resamples(list(LR=fit.glm, LDA=fit.lda, GLMNET=fit.glmnet, KNN=fit.knn,
CART=fit.cart, SVM=fit.svm))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: LR, LDA, GLMNET, KNN, CART, SVM
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LR 0.8928571 0.9454545 0.9632997 0.9629137 0.9818182 1 0
## LDA 0.8888889 0.9275974 0.9636364 0.9622415 0.9820617 1 0
## GLMNET 0.8928571 0.9454545 0.9727273 0.9671569 0.9954545 1 0
## KNN 0.9107143 0.9636364 0.9818182 0.9750477 1.0000000 1 0
## CART 0.8909091 0.9145172 0.9454545 0.9410169 0.9636364 1 0
## SVM 0.9090909 0.9456981 0.9636364 0.9617136 0.9814815 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LR 0.7613636 0.8808664 0.9191938 0.9183597 0.9602888 1 0
## LDA 0.7503852 0.8396132 0.9195906 0.9154755 0.9610166 1 0
## GLMNET 0.7613636 0.8808664 0.9403590 0.9275355 0.9902827 1 0
## KNN 0.7988506 0.9195906 0.9602888 0.9450762 1.0000000 1 0
## CART 0.7526237 0.8130487 0.8816524 0.8708245 0.9195906 1 0
## SVM 0.8135593 0.8845649 0.9215407 0.9187101 0.9598811 1 0
KNN shows the highest median and mean accuracy, suggesting it is the most consistently high-performing model in this set. GLMNET and LDA also perform very well, with results close to KNN, while CART trails with somewhat lower median and mean accuracy. No resamples failed (NA's = 0 for every model). In conclusion, KNN appears to be the most accurate model on this data, followed closely by GLMNET and LDA, with CART the weakest by median and mean accuracy.
dotplot(results)
Good accuracy can be seen across the board: every algorithm's mean accuracy exceeds 94%, well above the 64.96% baseline of always predicting benign. GLMNET reached a mean accuracy of 96.72%, and KNN, at 97.50%, had the highest accuracy on the problem.
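To check whether the gap between KNN and the other algorithms is statistically meaningful rather than resampling noise, caret can compare the resample distributions directly. A minimal sketch:
#sketch: pairwise differences between models, with adjusted p-values
summary(diff(results))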
#Transforming and normalizing skewed distributions
# Logistic Regression (LR)
set.seed(7)
fit.glm <- train(Class~., data=dataset_train, method="glm", metric=metric, preProc=c("BoxCox"), trControl=trainControl)
# Linear Discriminant Analysis (LDA)
set.seed(7)
fit.lda <- train(Class~., data=dataset_train, method="lda", metric=metric, preProc=c("BoxCox"), trControl=trainControl)
# Regularized Logistic Regression (GLMNET)
set.seed(7)
fit.glmnet <- train(Class~., data=dataset_train, method="glmnet", metric=metric, preProc=c("BoxCox"),
trControl=trainControl)
# K-Nearest Neighbour (KNN)
set.seed(7)
fit.knn <- train(Class~., data=dataset_train, method="knn", metric=metric, preProc=c("BoxCox"), trControl=trainControl)
# Classification and Regression Tree (CART)
set.seed(7)
fit.cart <- train(Class~., data=dataset_train, method="rpart", metric=metric, preProc=c("BoxCox"),
trControl=trainControl)
# SVM
set.seed(7)
fit.svm <- train(Class~., data=dataset_train, method="svmRadial", metric=metric, preProc=c("BoxCox"),
trControl=trainControl)
# Compare transformed algorithms
transformedResults <- resamples(list(LR=fit.glm, LDA=fit.lda, GLMNET=fit.glmnet, KNN=fit.knn,
CART=fit.cart, SVM=fit.svm))
summary(transformedResults)
##
## Call:
## summary.resamples(object = transformedResults)
##
## Models: LR, LDA, GLMNET, KNN, CART, SVM
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LR 0.8888889 0.9636364 0.9814815 0.9689534 0.9818182 1 0
## LDA 0.9272727 0.9631313 0.9816498 0.9738364 0.9818182 1 0
## GLMNET 0.9272727 0.9629630 0.9818182 0.9750705 0.9954545 1 0
## KNN 0.9272727 0.9636364 0.9814815 0.9744316 0.9818182 1 0
## CART 0.8909091 0.9145172 0.9454545 0.9410169 0.9636364 1 0
## SVM 0.9444444 0.9642857 0.9818182 0.9799423 1.0000000 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LR 0.7440758 0.9195906 0.9590968 0.9315514 0.9602888 1 0
## LDA 0.8430813 0.9207048 0.9598811 0.9433827 0.9609203 1 0
## GLMNET 0.8430813 0.9174940 0.9600850 0.9459062 0.9902827 1 0
## KNN 0.8430813 0.9214566 0.9595853 0.9446289 0.9602888 1 0
## CART 0.7526237 0.8130487 0.8816524 0.8708245 0.9195906 1 0
## SVM 0.8824383 0.9239130 0.9602888 0.9569363 1.0000000 1 0
After the Box-Cox transform, the previous best algorithm, KNN, actually slipped slightly (from a mean accuracy of 97.50% to 97.44%), while SVM improved markedly and now leads the ranking with a mean accuracy of 97.99%.
dotplot(transformedResults)
#Summarize the best model
print(fit.svm)
## Support Vector Machines with Radial Basis Function Kernel
##
## 548 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: Box-Cox transformation (9)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 493, 492, 493, 493, 493, 493, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.9799423 0.9569363
## 0.50 0.9799423 0.9569363
## 1.00 0.9799423 0.9569363
##
## Tuning parameter 'sigma' was held constant at a value of 0.1882722
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1882722 and C = 0.25.
Model performance: the cross-validated accuracy of 0.9799 is very high, indicating the model distinguishes the benign and malignant classes exceptionally well, and the Kappa of 0.9569 indicates almost perfect agreement between predicted and actual labels after adjusting for chance.
In conclusion, the SVM with an RBF kernel, using sigma = 0.1882722 and C = 0.25, is highly effective for this classification task. The cross-validation results are stable across the tested values of C, and the chosen model achieves high accuracy and Kappa, making it well suited to classifying the two categories of the dataset.
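Since accuracy was flat across C = 0.25 to 1 with sigma held constant, a wider search might squeeze out more performance. A hedged sketch with an illustrative grid (svmGrid and fit.svm.tuned are hypothetical names; svmRadial tunes sigma and C):
#sketch: widen the SVM tuning grid
svmGrid <- expand.grid(sigma = c(0.05, 0.10, 0.19, 0.30), C = c(0.25, 1, 4, 16))
set.seed(7)
fit.svm.tuned <- train(Class~., data=dataset_train, method="svmRadial", metric=metric,
                       preProc=c("BoxCox"), tuneGrid=svmGrid, trControl=trainControl)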
# Since SVM produced the best model, estimate its skill on the held-out test data
predictions <- predict(fit.svm, test_data)
predictions
## [1] malignant benign benign benign benign benign benign
## [8] malignant malignant benign benign benign benign malignant
## [15] benign benign benign malignant malignant malignant malignant
## [22] malignant benign benign benign benign benign benign
## [29] malignant benign benign malignant benign benign malignant
## [36] malignant benign malignant benign benign malignant malignant
## [43] malignant benign benign malignant benign malignant malignant
## [50] benign benign malignant malignant benign benign malignant
## [57] malignant malignant benign malignant malignant benign malignant
## [64] benign malignant malignant benign benign malignant malignant
## [71] malignant benign benign benign benign benign malignant
## [78] malignant benign benign benign benign benign benign
## [85] benign benign benign benign malignant malignant malignant
## [92] malignant malignant malignant benign benign benign benign
## [99] benign benign benign benign malignant benign benign
## [106] benign benign benign benign benign benign malignant
## [113] benign malignant malignant malignant benign benign benign
## [120] benign benign malignant benign benign malignant malignant
## [127] benign benign malignant benign benign benign benign
## [134] benign malignant
## Levels: benign malignant
#confusion matrix
confusionMatrix(predictions, test_data$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 83 1
## malignant 5 46
##
## Accuracy : 0.9556
## 95% CI : (0.9058, 0.9835)
## No Information Rate : 0.6519
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.904
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9432
## Specificity : 0.9787
## Pos Pred Value : 0.9881
## Neg Pred Value : 0.9020
## Prevalence : 0.6519
## Detection Rate : 0.6148
## Detection Prevalence : 0.6222
## Balanced Accuracy : 0.9610
##
## 'Positive' Class : benign
##
With 'benign' as the positive class, the matrix breaks down as: 83 true positives (correctly predicted benign), 1 false positive (predicted benign but actually malignant), 5 false negatives (predicted malignant but actually benign), and 46 true negatives (correctly predicted malignant).
The model performs very well on the held-out data: accuracy of 95.56%, Kappa of 0.904 (excellent chance-adjusted agreement), sensitivity of 94.32%, and specificity of 97.87%. The positive predictive value for benign (0.9881) is very high, indicating reliable identification of benign cases; the negative predictive value (0.9020) is also strong, though slightly lower. Balanced accuracy is 96.10%.
The 95% confidence interval for accuracy, (0.9058, 0.9835), suggests the model's true accuracy likely falls within this range. Combined with the high sensitivity and specificity, this indicates a robust, reliable classifier, which matters in a diagnostic setting where correctly separating cancerous from non-cancerous cases is critical.
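As a possible last step, the fitted caret model can be persisted so that new cases can be scored without retraining; a minimal sketch (the file name is illustrative):
#sketch: save the final model and reload it for scoring
saveRDS(fit.svm, "svm_breastcancer_model.rds")
final_model <- readRDS("svm_breastcancer_model.rds")
head(predict(final_model, newdata = test_data))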