In this R notebook we explore the data analysis and data visualization power of R.
In this example we analyze the heart disease dataset from the UCI Machine Learning Repository.
The original database contains 76 attributes, but the subset used here has 13 predictors (features) and 303 observations. Heart disease is binary coded in the target column: 1 for presence and 0 for no presence. The prerequisite for running this R Markdown document is to download the CSV data file into your working directory, which can be set in an R chunk as follows: setwd("C:\\Users\\RajuPC\\Documents\\MyR")
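As a quick pre-flight check (an optional sketch, not part of the original notebook), you can confirm that the file is visible from the working directory before knitting:
file.exists("heart.csv") # should return TRUE before the chunks below are run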
First, load the supporting R libraries.
setwd("C:\\Users\\RajuPC\\Documents\\MyR") # Set the working directory
library(tidyverse) # Efficient data manipulation and visualization
library(caret)     # A collection of machine learning tools
library(plotly)    # Interactive graphing system
library(ggsci)     # A collection of themes and palettes for ggplot2
Load the UCI heart disease data.
#Load the CSV data file
hci<-read_csv("heart.csv")
Parsed with column specification:
cols(
age = col_integer(),
sex = col_integer(),
cp = col_integer(),
trestbps = col_integer(),
chol = col_integer(),
fbs = col_integer(),
restecg = col_integer(),
thalach = col_integer(),
exang = col_integer(),
oldpeak = col_double(),
slope = col_integer(),
ca = col_integer(),
thal = col_integer(),
target = col_integer()
)
hci$sex <- as.character(hci$sex)
hci$sex[hci$sex== 1] <- "Male"
hci$sex[hci$sex== 0] <- "Female"
summary(hci)
age sex cp trestbps chol
Min. :29.00 Length:303 Min. :0.000 Min. : 94.0 Min. :126.0
1st Qu.:47.50 Class :character 1st Qu.:0.000 1st Qu.:120.0 1st Qu.:211.0
Median :55.00 Mode :character Median :1.000 Median :130.0 Median :240.0
Mean :54.37 Mean :0.967 Mean :131.6 Mean :246.3
3rd Qu.:61.00 3rd Qu.:2.000 3rd Qu.:140.0 3rd Qu.:274.5
Max. :77.00 Max. :3.000 Max. :200.0 Max. :564.0
fbs restecg thalach exang oldpeak slope
Min. :0.0000 Min. :0.0000 Min. : 71.0 Min. :0.0000 Min. :0.00 Min. :0.000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000
Median :0.0000 Median :1.0000 Median :153.0 Median :0.0000 Median :0.80 Median :1.000
Mean :0.1485 Mean :0.5281 Mean :149.6 Mean :0.3267 Mean :1.04 Mean :1.399
3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0 3rd Qu.:1.0000 3rd Qu.:1.60 3rd Qu.:2.000
Max. :1.0000 Max. :2.0000 Max. :202.0 Max. :1.0000 Max. :6.20 Max. :2.000
ca thal target
Min. :0.0000 Min. :0.000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:0.0000
Median :0.0000 Median :2.000 Median :1.0000
Mean :0.7294 Mean :2.314 Mean :0.5446
3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.0000
Max. :4.0000 Max. :3.000 Max. :1.0000
as_tibble(hci) # A nicer view of the data as a tibble
Convert the following predictors to factors for plotting.
# Convert the following predictors to factors for plotting
hci$sex<-as.factor(hci$sex)
hci$cp<-as.factor(hci$cp)
hci$thal<-as.factor(hci$thal)
hci$ca<-as.factor(hci$ca)
Distribution of the male and female population across age.
ggplotly(p1<-hci %>% ggplot(aes(x=age,fill=sex))+geom_bar()+xlab("Age") +
ylab("Number")+ guides(fill = guide_legend(title = "Gender"))
)%>% layout(legend = list(orientation = "h", x = 0, y = 1))
Representation of cholesterol level by age and gender.
p2<-hci %>% ggplot(aes(x=age,y=chol,fill=sex, size=chol))+geom_point(alpha=0.7)+xlab("Age") +
ylab("Cholestoral")+ scale_fill_npg()+guides(fill = guide_legend(title = "Gender"))+
theme(plot.margin = margin(0.1,.1,.1,.1, "cm"))
ggplotly(p2)%>% layout(legend = list(orientation = "h", x = 0, y = 1))
Representation of cholesterol level across fasting blood sugar levels (fbs: 1 = fasting blood sugar > 120 mg/dl, 0 otherwise).
p3<-hci %>% ggplot(aes(x=age,y=chol,fill=sex, size=chol))+geom_point(alpha=0.7)+xlab("Age") +
ylab("Cholestoral")+facet_grid(.~fbs)+
theme(plot.margin = margin(0.1,.1,.1,.1, "cm"))
# ggsave("p3.png", plot=p3, dpi=300) # uncomment to save the plot
ggplotly(p3)%>%layout(legend = list(orientation = "h", x = 0, y = 1))
Comparison of blood pressure across chest pain types (0-3).
p4<-hci%>%ggplot(aes(x=sex,y=trestbps))+geom_boxplot(fill="darkorange")+xlab("Sex")+ylab("BP")+facet_grid(~cp)
ggplotly(p4)
Comparison of cholesterol across chest pain types (0-3).
p5<-hci%>%ggplot(aes(x=sex,y=chol))+geom_boxplot(fill="#D55E00")+xlab("Sex")+ylab("Chol")+facet_grid(~cp)
ggplotly(p5)
Relationship between gender, age, cholesterol, and blood pressure.
# Scatterplot
gg <- ggplot(hci, aes(x=age, y=chol, col=sex)) +
geom_point(aes( size=trestbps),shape=1,alpha=0.6) + theme_bw()+
geom_smooth(method="loess", se=F) +theme(plot.margin = margin(0.1,.1,.1,.1, "cm"))
ggplotly(gg)%>%layout(legend = list(orientation = "h", x = 0, y = 1))
First, the data is partitioned into training and test datasets. The CSV is reloaded so that sex and the other predictors return to their original numeric coding before modeling.
# Create the training and test datasets
set.seed(100)
hci<-read_csv("heart.csv")
Parsed with column specification:
cols(
age = col_integer(),
sex = col_integer(),
cp = col_integer(),
trestbps = col_integer(),
chol = col_integer(),
fbs = col_integer(),
restecg = col_integer(),
thalach = col_integer(),
exang = col_integer(),
oldpeak = col_double(),
slope = col_integer(),
ca = col_integer(),
thal = col_integer(),
target = col_integer()
)
# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(hci$target, p=0.8, list=FALSE)
# Step 2: Create the training dataset
trainData <- hci[trainRowNumbers,]
# Step 3: Create the test dataset
testData <- hci[-trainRowNumbers,]
# Store X and Y for later use.
x = trainData[, 1:13]
# Recode target as "P"/"N": caret's classProbs/twoClassSummary requires class levels that are valid R names
trainData$target[trainData$target==1]<-"P"
trainData$target[trainData$target==0]<-"N"
y=trainData$target
testData$target[testData$target==1]<-"P"
testData$target[testData$target==0]<-"N"
yt=testData$target
# The structure of the transformed datasets is shown after normalization below
Normalization of features to the [0, 1] range.
preProcess_range_model <- preProcess(trainData, method='range')
preProcess_range_model1 <- preProcess(testData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
testData <- predict(preProcess_range_model1, newdata = testData)
# Append the Y variable
trainData$target <- as.factor(y)
testData$target<-as.factor(yt)
# apply(trainData[, 1:13], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))}) # check that every feature now lies in [0, 1]
str(trainData)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 243 obs. of 14 variables:
$ age : num 0.25 0.562 0.583 0.562 0.312 ...
$ sex : num 0 1 1 0 1 1 1 1 0 0 ...
$ cp : num 0.333 0.333 0 0.333 0.333 ...
$ trestbps: num 0.34 0.245 0.434 0.434 0.245 ...
$ chol : num 0.178 0.251 0.151 0.384 0.313 ...
$ fbs : num 0 0 0 0 0 1 0 0 1 0 ...
$ restecg : num 0 0.5 0.5 0 0.5 0.5 0.5 0 0 0.5 ...
$ thalach : num 0.771 0.817 0.588 0.626 0.779 ...
$ exang : num 0 0 0 0 0 0 0 1 0 0 ...
$ oldpeak : num 0.2258 0.129 0.0645 0.2097 0 ...
$ slope : num 1 1 0.5 0.5 1 1 1 0.5 1 0.5 ...
$ ca : num 0 0 0 0 0 0 0 0 0 0 ...
$ thal : num 0.667 0.667 0.333 0.667 1 ...
$ target : Factor w/ 2 levels "N","P": 2 2 2 2 2 2 2 2 2 2 ...
str(testData)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 60 obs. of 14 variables:
$ age : num 0.8235 0.0588 0.6471 0.6471 0.5588 ...
$ sex : num 1 1 0 1 1 0 1 1 0 1 ...
$ cp : num 1 0.667 0 0.667 0 ...
$ trestbps: num 0.593 0.419 0.302 0.651 0.535 ...
$ chol : num 0.26693 0.33466 0.749 0.00797 0.29084 ...
$ fbs : num 1 0 0 0 0 0 0 0 1 1 ...
$ restecg : num 0 1 1 1 1 1 1 1 0 0 ...
$ thalach : num 0.619 1 0.753 0.866 0.722 ...
$ exang : num 0 0 1 0 0 0 0 1 0 0 ...
$ oldpeak : num 0.411 0.625 0.107 0.286 0.214 ...
$ slope : num 0 0 1 1 1 1 1 1 1 0 ...
$ ca : num 0 0 0 0 0 0 0 0 0.25 0 ...
$ thal : num 0 0.5 0.5 0.5 0.5 0.5 0.5 1 0.5 0.5 ...
$ target : Factor w/ 2 levels "N","P": 2 2 2 2 2 2 2 2 2 2 ...
Detection of heart disease with the MARS ("earth") method available through the caret package.
#fit control
fitControl <- trainControl(
method = 'cv', # k-fold cross validation
number = 5, # number of folds
savePredictions = 'final', # saves predictions for optimal tuning parameter
classProbs = T, # should class probabilities be returned
summaryFunction=twoClassSummary # results summary function
)
# Step 1: Tune hyperparameters by setting tuneLength
set.seed(100)
model_mars2 = train(target ~ ., data=trainData, method='earth', tuneLength = 5, metric='ROC', trControl = fitControl)
Loading required package: earth
Loading required package: plotmo
Loading required package: plotrix
Loading required package: TeachingDemos
Attaching package: 'TeachingDemos'
The following object is masked from 'package:plotly':
subplot
model_mars2
Multivariate Adaptive Regression Spline
243 samples
13 predictor
2 classes: 'N', 'P'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 194, 194, 195, 195, 194
Resampling results across tuning parameters:
nprune ROC Sens Spec
2 0.7904281 0.7956710 0.7851852
5 0.8950297 0.7766234 0.8444444
9 0.9031826 0.7952381 0.8666667
13 0.9031826 0.7952381 0.8666667
17 0.9031826 0.7952381 0.8666667
Tuning parameter 'degree' was held constant at a value of 1
ROC was used to select the optimal model using the largest value.
The final values used for the model were nprune = 9 and degree = 1.
# Step 2: Predict on testData and Compute the confusion matrix
predicted2 <- predict(model_mars2, testData)
confusionMatrix(reference = testData$target, data = predicted2, mode='everything')
Confusion Matrix and Statistics
Reference
Prediction N P
N 24 6
P 6 24
Accuracy : 0.8
95% CI : (0.6767, 0.8922)
No Information Rate : 0.5
P-Value [Acc > NIR] : 1.592e-06
Kappa : 0.6
Mcnemar's Test P-Value : 1
Sensitivity : 0.8
Specificity : 0.8
Pos Pred Value : 0.8
Neg Pred Value : 0.8
Precision : 0.8
Recall : 0.8
F1 : 0.8
Prevalence : 0.5
Detection Rate : 0.4
Detection Prevalence : 0.5
Balanced Accuracy : 0.8
'Positive' Class : N
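As an optional follow-up (a sketch that is not part of the original output), caret's varImp() can show which predictors the tuned MARS model relies on most:
# Variable importance of the tuned MARS model (optional sketch; output not shown)
mars_imp <- varImp(model_mars2)
plot(mars_imp, main = "MARS variable importance")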
Comparison of some common ML methods using caret's resamples().
# Train the model using adaboost
model_adaboost = train(target ~ ., data=trainData, method='adaboost', tuneLength=2, trControl = fitControl)
The metric "Accuracy" was not in the result set. ROC will be used instead.
model1 = train(target ~ ., data=trainData, method='knn', tuneLength=2, trControl = fitControl)#KNN Model
The metric "Accuracy" was not in the result set. ROC will be used instead.
model2 = train(target ~ ., data=trainData, method='svmRadial', tuneLength=2, trControl = fitControl)#SVM
The metric "Accuracy" was not in the result set. ROC will be used instead.
model2 = train(target ~ ., data=trainData, method='rpart', tuneLength=2, trControl = fitControl) # rpart decision tree (CART), not a random forest; this reuses model2, so the SVM and RanF entries below report the same rpart fit
The metric "Accuracy" was not in the result set. ROC will be used instead.
# Compare model performances using resamples()
models_compare <- resamples(list(EARTH=model_mars2,ADABOOST=model_adaboost, KNN=model1,SVM=model2, RanF=model2))
# Summary of the model performances
summary(models_compare)
Call:
summary.resamples(object = models_compare)
Models: EARTH, ADABOOST, KNN, SVM, RanF
Number of resamples: 5
ROC
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
EARTH 0.8668430 0.8989899 0.9048822 0.9031826 0.9175084 0.9276896 0
ADABOOST 0.8451178 0.8483245 0.8569024 0.8618246 0.8783069 0.8804714 0
KNN 0.7989418 0.8552189 0.8821549 0.8865961 0.9478114 0.9488536 0
SVM 0.6693122 0.7154882 0.7710438 0.7625541 0.8227513 0.8341751 0
RanF 0.6693122 0.7154882 0.7710438 0.7625541 0.8227513 0.8341751 0
Sens
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
EARTH 0.6190476 0.7727273 0.8571429 0.7952381 0.8636364 0.8636364 0
ADABOOST 0.6818182 0.7142857 0.7272727 0.7316017 0.7619048 0.7727273 0
KNN 0.6666667 0.7272727 0.8095238 0.7770563 0.8181818 0.8636364 0
SVM 0.5714286 0.7272727 0.7272727 0.7406926 0.7727273 0.9047619 0
RanF 0.5714286 0.7272727 0.7272727 0.7406926 0.7727273 0.9047619 0
Spec
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
EARTH 0.8148148 0.8518519 0.8518519 0.8666667 0.8518519 0.9629630 0
ADABOOST 0.7407407 0.7777778 0.7777778 0.7925926 0.8148148 0.8518519 0
KNN 0.7407407 0.8148148 0.8888889 0.8666667 0.8888889 1.0000000 0
SVM 0.7037037 0.7407407 0.8148148 0.8296296 0.9259259 0.9629630 0
RanF 0.7037037 0.7407407 0.8148148 0.8296296 0.9259259 0.9629630 0
# Draw box plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(models_compare, scales=scales)
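As a final optional sketch (not part of the original notebook), the same held-out test-set check used for the MARS model can be applied to any of the other fits, for example the KNN model:
# Optional: verify the resampling ranking on the test set (sketch; output not shown)
predicted_knn <- predict(model1, testData)
confusionMatrix(reference = testData$target, data = predicted_knn, mode = 'everything')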