Samples were obtained from Pakastein. At first, we will import the data to R environment by using readxl R Package. Then, we will look at the Column Names. Get an curated data frame from the Raw data set.
library(readxl)
## Warning: package 'readxl' was built under R version 3.1.3
df <- read_excel("Data-7.xlsx")
names(df)
## [1] "Lab No" "Age" "Gender" "RBC" "HB"
## [6] "HCT" "MCV" "MCH" "MCHC" "RDW"
## [11] "WBC" "NEUTRO" "LYMPHO" "MONO" "EOSINO"
## [16] "BASO" "PLAT" "Microcyte" "Hypoch" "Teardrop"
## [21] "Target" "Schistocyte" "Ferritin" "Remarks"
name <- c("Remarks", "RBC", "HB", "HCT", "MCV", "MCH", "MCHC", "RDW", "Ferritin")
new_df <- df[, name]
btt <- subset(new_df, Remarks == "BTT")
ida <- subset(new_df, Remarks == "IDA")
att <- subset(new_df, Remarks == "ATT")
data <- rbind(btt, ida, att)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
ggdata <- data.frame(table(data$Remarks))
Label <- ggdata$Var1
x <- ggdata$Freq
pie(x, Label, col = c("green", "red", "blue"))
library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.1.3
data$Remarks <- as.factor(data$Remarks)
index <- createDataPartition(data$Remarks, times = 1, p = 0.8, list = FALSE)
Train <- data[index, ]
Test <- data[-index, ]
dim(Train)
## [1] 736 9
dim(Test)
## [1] 182 9
When we look at the dimension of the Training Set and Testing Set with dim function from Base R Program, we found that the Training sample has 736 samples while the external set has 189 samples, which will be used for validating our random forest model.
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
fit <- randomForest(Remarks ~ ., data = Train, ntree = 100, importance = TRUE)
prediction <- predict(fit, Train)
actual <- Train$Remarks
result <- data.frame(prediction, actual)
head(result, 3)
## prediction actual
## 1 BTT BTT
## 2 BTT BTT
## 4 BTT BTT
library(caret)
prediction <- predict(fit, Test)
actual <- Test$Remarks
result <- caret::confusionMatrix(prediction, actual, positive = "IDA")
print(result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ATT BTT IDA
## ATT 0 0 0
## BTT 2 98 0
## IDA 0 0 82
##
## Overall Statistics
##
## Accuracy : 0.989
## 95% CI : (0.9609, 0.9987)
## No Information Rate : 0.5385
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9781
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: ATT Class: BTT Class: IDA
## Sensitivity 0.00000 1.0000 1.0000
## Specificity 1.00000 0.9762 1.0000
## Pos Pred Value NaN 0.9800 1.0000
## Neg Pred Value 0.98901 1.0000 1.0000
## Prevalence 0.01099 0.5385 0.4505
## Detection Rate 0.00000 0.5385 0.4505
## Detection Prevalence 0.00000 0.5495 0.4505
## Balanced Accuracy 0.50000 0.9881 1.0000
library(randomForest)
set.seed(1)
varImpPlot(fit)
In Random Forest Algorithm, there are two paramters, which are MeanDecreaseAccuracy and MeanDecreaseGini. It was reported that the MeanDecreaseGini parameters is highly robust when relating the feature with outcome (i.e., BTT, blah). The feature that has a top value is the most important. From the Feature importance, we found that RDW is highly importance. Thus we want to make a histogram or barplot of this Feature. RDW is the distance between the red blood cell in the blood flow.
library(ggplot2)
df <- Train
ggplot(df, aes(x = RDW, fill = Remarks)) + geom_histogram(color = "white") +
annotate("text", x = 15, y = 50, label = "ATT, BTT") + annotate("text",
x = 22, y = 50, label = "IDA") + ylab("Frequency") + xlab("Red Cell Distribution Width") +
theme_bw() + ggtitle("Frequency Distribution of RDW")
Although, Random Forest is a great ensemble model, we would like to obtain if-else statements which will allow use to pinpoint the cutoff of parameters to determine the outcome. Because of that, we will use Deciion Tree Model. The rpart package will be used to train the model and rpart.plot R package to create a pretty plot.
library(rpart)
library(rpart.plot)
library(RColorBrewer)
tree <- rpart(Remarks~., data = Train)
rattle::asRules(tree)
## Warning: Failed to load RGtk2 dynamic library, attempting to install it.
## Please install GTK+ from http://r.research.att.com/libs/GTK_2.24.17-X11.pkg
## If the package still does not load, please ensure that GTK+ is installed and that it is on your PATH environment variable
## IN ANY CASE, RESTART R BEFORE TRYING TO LOAD THE PACKAGE AGAIN
##
## Rule number: 2 [Remarks=BTT cover=408 (55%) prob=0.03]
## RDW< 16.15
##
## Rule number: 3 [Remarks=IDA cover=328 (45%) prob=0.00]
## RDW>=16.15
rattle::fancyRpartPlot(tree)
Train_no_RDW <- Train[, -c(8)]
tree <- rpart(Remarks~., data = Train_no_RDW)
rattle::asRules(tree)
##
## Rule number: 2 [Remarks=BTT cover=408 (55%) prob=0.03]
## MCH< 21.92
##
## Rule number: 3 [Remarks=IDA cover=328 (45%) prob=0.00]
## MCH>=21.92
rattle::fancyRpartPlot(tree)
Train_no_MCH <- Train[, -c(6)]
tree <- rpart(Remarks~., data = Train_no_MCH)
rattle::asRules(tree)
##
## Rule number: 2 [Remarks=BTT cover=408 (55%) prob=0.03]
## RDW< 16.15
##
## Rule number: 3 [Remarks=IDA cover=328 (45%) prob=0.00]
## RDW>=16.15
rattle::fancyRpartPlot(tree)
Train_no_MCH_RDW <- Train[, -c(8, 6)]
tree <- rpart(Remarks~., data = Train_no_MCH_RDW)
rattle::asRules(tree)
##
## Rule number: 2 [Remarks=BTT cover=410 (56%) prob=0.03]
## RBC>=4.96
##
## Rule number: 3 [Remarks=IDA cover=326 (44%) prob=0.00]
## RBC< 4.96
rattle::fancyRpartPlot(tree)
Train_no_MCH_MCV_RDW <- Train[, -c(8, 5, 6)]
tree <- rpart(Remarks~., data = Train_no_MCH_MCV_RDW)
rattle::asRules(tree)
##
## Rule number: 2 [Remarks=BTT cover=410 (56%) prob=0.03]
## RBC>=4.96
##
## Rule number: 3 [Remarks=IDA cover=326 (44%) prob=0.00]
## RBC< 4.96
rattle::fancyRpartPlot(tree)
library(ggplot2)
library(grid)
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.1.3
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:ggplot2':
##
## ggsave
names <- c("RBC", "HB", "HCT", "MCV", "MCH", "MCHC", "RDW", "Ferritin")
for (i in names) {
p <- ggplot(df, aes(factor(Remarks), df[, i])) + geom_boxplot(aes(fill = Remarks)) + ggtitle(i) +
theme(
legend.position = ("none"),
plot.margin = unit(c(1, 1, 1, 1), "cm"),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text(size = 20, color = "black"),
axis.text.y = element_text(size = 20, color = "black"),
plot.title = element_text(size = 30),
panel.border = element_rect(linetype = "solid", colour = "black",
fill = NA, size = 1))
print(p)
}
rdw <- df[, "RDW"]
mcv <- df[, "MCV"]
mch <- df[, "MCH"]
new <- (rdw*mcv)+mch
new_data <- data.frame(new, outcome = df[, "Remarks"])
p <- ggplot(new_data, aes(factor(outcome), new)) + geom_boxplot(aes(fill = outcome)) + ggtitle("New Formula") +
theme(
legend.position = ("none"),
plot.margin = unit(c(1, 1, 1, 1), "cm"),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text(size = 20, color = "black"),
axis.text.y = element_text(size = 20, color = "black"),
plot.title = element_text(size = 30),
panel.border = element_rect(linetype = "solid", colour = "black",
fill = NA, size = 1))
p
rdw <- df[, "RDW"]
mcv <- df[, "MCV"]
mch <- df[, "MCH"]
new <- (rdw*mcv)+mch
new_data <- data.frame(new, df[, "Remarks"])
Train_no_MCH_RDW <- data.frame(Train[, -c(8)], new_data = new_data[, "new"])
tree <- rpart(Remarks~., data = Train_no_MCH_RDW)
rattle::asRules(tree)
##
## Rule number: 2 [Remarks=BTT cover=408 (55%) prob=0.03]
## new_data< 1122
##
## Rule number: 3 [Remarks=IDA cover=328 (45%) prob=0.00]
## new_data>=1122
prediction <- predict(tree, Train_no_MCH_RDW, type = "class")
actual <- Train_no_MCH_RDW$Remarks
caret::confusionMatrix(prediction, actual)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ATT BTT IDA
## ATT 0 0 0
## BTT 12 396 0
## IDA 0 0 328
##
## Overall Statistics
##
## Accuracy : 0.9837
## 95% CI : (0.9717, 0.9915)
## No Information Rate : 0.538
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9676
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: ATT Class: BTT Class: IDA
## Sensitivity 0.0000 1.0000 1.0000
## Specificity 1.0000 0.9647 1.0000
## Pos Pred Value NaN 0.9706 1.0000
## Neg Pred Value 0.9837 1.0000 1.0000
## Prevalence 0.0163 0.5380 0.4457
## Detection Rate 0.0000 0.5380 0.4457
## Detection Prevalence 0.0000 0.5543 0.4457
## Balanced Accuracy 0.5000 0.9824 1.0000
rattle::fancyRpartPlot(tree)