Libraries and user-defined functions

Data import

EDA

# Class balance: share of each diagnosis in the full data set
prop.table(table(cancer[["diagnosis"]]))
## 
##    Benign Malignant 
## 0.6274165 0.3725835
# Number of cases per diagnosis
summary(cancer[["diagnosis"]])
##    Benign Malignant 
##       357       212
# Pairwise correlations of every column after the first, drawn as a
# correlation map
corr_map <- cor(cancer[, -1])
corrplot(corr_map)

From the EDA, we know that the two classes are unbalanced: there are 63% benign cases and 37% malignant cases. There are correlations between some of the variables, among which some are strong.

Partition training data and test data

# Split the data roughly 4:1 into training and test sets by row position.
# NOTE(review): rows are taken in file order without shuffling, and of the 569
# cases only rows 1-502 are used here -- confirm both are intended.
train_df <- cancer[1:401, ]
# Labels are taken by column name rather than position: cor() is run on
# columns 2..ncol in the EDA, so column 2 is presumably a numeric feature,
# not the diagnosis.
train_labels <- train_df$diagnosis
ft_train <- frqtab(train_df$diagnosis)

test_df <- cancer[402:502, ]
# Test labels must come from the test rows (they were previously copied from
# train_df, giving 401 training labels for a 101-row test set).
test_labels <- test_df$diagnosis
ft_test <- frqtab(test_df$diagnosis)

# Compare diagnosis frequencies across the full data, training and test sets
ft_orig <- frqtab(cancer$diagnosis)
ftcmp_df <- as.data.frame(cbind(ft_orig, ft_train, ft_test))
colnames(ftcmp_df) <- c("Original", "Training set", "Test set")

pander(ftcmp_df, style = "rmarkdown",
       caption = "Comparison of diagnosis frequencies (in %)")
Comparison of diagnosis frequencies (in %)
  Original Training set Test set
Benign 62.7 62.8 62.4
Malignant 37.3 37.2 37.6

Random Forest

# Fit a random forest on all predictors. The seed is fixed because
# randomForest draws bootstrap samples and candidate split variables at
# random, so unseeded fits give slightly different accuracy on every run.
set.seed(42)
rf_model <- randomForest(diagnosis ~ .,
                         data = train_df, mtry = 17, importance = TRUE)

# Predict the held-out test rows
rf_model_pred_test <- predict(rf_model, test_df)

# Confusion matrix and statistics, with "Malignant" as the positive class
cm_rf_model <- confusionMatrix(rf_model_pred_test, test_df$diagnosis,
                               positive = "Malignant")
cm_rf_model
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        63         1
##   Malignant      0        37
##                                           
##                Accuracy : 0.9901          
##                  95% CI : (0.9461, 0.9997)
##     No Information Rate : 0.6238          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9788          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9737          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9844          
##              Prevalence : 0.3762          
##          Detection Rate : 0.3663          
##    Detection Prevalence : 0.3663          
##       Balanced Accuracy : 0.9868          
##                                           
##        'Positive' Class : Malignant       
## 

If we include all factors, an accuracy of 0.9901, 95% CI: (0.9461, 0.9997), can be achieved. However, we know from the previous correlation map that some of the factors can be eliminated; maybe this will yield better accuracy.

We see that radius, perimeter and area are correlated. Fractal_dimension_mean and smoothness_se seem not to negatively correlate with radius, perimeter and area.

# Model 1: random forest on a reduced predictor set. The seed is fixed because
# randomForest draws bootstrap samples and candidate split variables at
# random, so unseeded fits give slightly different accuracy on every run.
set.seed(42)
rf_model1 <- randomForest(
  diagnosis ~
    perimeter_mean + texture_mean + smoothness_mean + compactness_mean +
    concavity_mean + concave.points_mean + symmetry_mean +
    area_se + texture_se + compactness_se + concavity_se +
    concave.points_se + fractal_dimension_se +
    radius_worst + texture_worst + smoothness_worst + compactness_worst +
    concavity_worst + concave.points_worst + symmetry_worst +
    fractal_dimension_worst,
  data = train_df, mtry = 17, importance = TRUE
)

# Predict the held-out test rows
rf_model1_pred_test <- predict(rf_model1, test_df)

# Confusion matrix and statistics, with "Malignant" as the positive class
cm_rf_model1 <- confusionMatrix(rf_model1_pred_test, test_df$diagnosis,
                                positive = "Malignant")
cm_rf_model1
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        63         2
##   Malignant      0        36
##                                           
##                Accuracy : 0.9802          
##                  95% CI : (0.9303, 0.9976)
##     No Information Rate : 0.6238          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9574          
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.9474          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9692          
##              Prevalence : 0.3762          
##          Detection Rate : 0.3564          
##    Detection Prevalence : 0.3564          
##       Balanced Accuracy : 0.9737          
##                                           
##        'Positive' Class : Malignant       
## 

This time, I chose only one among perimeter, radius and area, and eliminated Fractal_dimension_mean, symmetry_se and smoothness_se. Now the accuracy decreased to 0.98.

# Model 2: model 1 without any of the _se predictors. The seed is fixed
# because randomForest draws bootstrap samples and candidate split variables
# at random, so unseeded fits give slightly different accuracy on every run.
set.seed(42)
rf_model2 <- randomForest(
  diagnosis ~
    perimeter_mean + texture_mean + smoothness_mean + compactness_mean +
    concavity_mean + concave.points_mean + symmetry_mean +
    radius_worst + texture_worst + smoothness_worst + compactness_worst +
    concavity_worst + concave.points_worst + symmetry_worst +
    fractal_dimension_worst,
  data = train_df, mtry = 10, importance = TRUE
)

# Predict the held-out test rows
rf_model2_pred_test <- predict(rf_model2, test_df)

# Confusion matrix and statistics, with "Malignant" as the positive class
cm_rf_model2 <- confusionMatrix(rf_model2_pred_test, test_df$diagnosis,
                                positive = "Malignant")
cm_rf_model2
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        62         1
##   Malignant      1        37
##                                           
##                Accuracy : 0.9802          
##                  95% CI : (0.9303, 0.9976)
##     No Information Rate : 0.6238          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9578          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9737          
##             Specificity : 0.9841          
##          Pos Pred Value : 0.9737          
##          Neg Pred Value : 0.9841          
##              Prevalence : 0.3762          
##          Detection Rate : 0.3663          
##    Detection Prevalence : 0.3762          
##       Balanced Accuracy : 0.9789          
##                                           
##        'Positive' Class : Malignant       
## 

This time I excluded all the _se factors from model 1, and obtained an accuracy of 0.9802, 95% CI: (0.9303, 0.9976), the same as model 1 and slightly below the 0.9901 obtained with all factors.

# Model 3: all _mean and _worst predictors, no _se predictors. The seed is
# fixed because randomForest draws bootstrap samples and candidate split
# variables at random, so unseeded fits give slightly different accuracy on
# every run.
set.seed(42)
rf_model3 <- randomForest(
  diagnosis ~
    radius_mean + perimeter_mean + texture_mean + area_mean +
    smoothness_mean + compactness_mean + concavity_mean +
    concave.points_mean + symmetry_mean + fractal_dimension_mean +
    radius_worst + perimeter_worst + texture_worst + area_worst +
    smoothness_worst + compactness_worst + concavity_worst +
    concave.points_worst + symmetry_worst + fractal_dimension_worst,
  data = train_df, mtry = 10, importance = TRUE
)

# Predict the held-out test rows
rf_model3_pred_test <- predict(rf_model3, test_df)

# Confusion matrix and statistics, with "Malignant" as the positive class
cm_rf_model3 <- confusionMatrix(rf_model3_pred_test, test_df$diagnosis,
                                positive = "Malignant")
cm_rf_model3
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        62         1
##   Malignant      1        37
##                                           
##                Accuracy : 0.9802          
##                  95% CI : (0.9303, 0.9976)
##     No Information Rate : 0.6238          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9578          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9737          
##             Specificity : 0.9841          
##          Pos Pred Value : 0.9737          
##          Neg Pred Value : 0.9841          
##              Prevalence : 0.3762          
##          Detection Rate : 0.3663          
##    Detection Prevalence : 0.3762          
##       Balanced Accuracy : 0.9789          
##                                           
##        'Positive' Class : Malignant       
## 

This time I excluded all _se factors from the full set of factors, and again obtained an accuracy of 0.9802, 95% CI: (0.9303, 0.9976), slightly below the 0.9901 obtained with all factors.

I conclude that, actually, including all factors yields the best result.