Recording Link : 2501961994_FinalExam.mkv Import Library Dataset
options(warn=-1)
library(ROCR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggcorrplot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(rpart.plot)
#Import Dataset (Even NIM Numbers -> StrokeData.csv)
StrokeData <- read.csv("StrokeData.csv",na.strings=c("N/A"))
head(StrokeData)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 202.21 NA never smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24.0 never smoked 1
## 6 Urban 186.21 29.0 formerly smoked 1
A. Performing EDA
summary(StrokeData)
## id gender age hypertension
## Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Mode :character Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
##
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 Length:5110 Length:5110 Length:5110
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.05401
## 3rd Qu.:0.00000
## Max. :1.00000
##
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 Min. :10.30 Length:5110 Min. :0.00000
## 1st Qu.: 77.25 1st Qu.:23.50 Class :character 1st Qu.:0.00000
## Median : 91.89 Median :28.10 Mode :character Median :0.00000
## Mean :106.15 Mean :28.89 Mean :0.04873
## 3rd Qu.:114.09 3rd Qu.:33.10 3rd Qu.:0.00000
## Max. :271.74 Max. :97.60 Max. :1.00000
## NA's :201
colSums(is.na(StrokeData))
## id gender age hypertension
## 0 0 0 0
## heart_disease ever_married work_type Residence_type
## 0 0 0 0
## avg_glucose_level bmi smoking_status stroke
## 0 201 0 0
There are 201 missing value bmi variable.
str(StrokeData)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : num 36.6 NA 32.5 34.4 24 29 27.4 22.8 NA 24.2 ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
There are 5110 data observations and 12 attributes in this dataset.
Then, there are:
4 Integer Attributes : id, hypertension, heart_disease, stroke
5 Character Attributes : gender, ever_married, work_type, Residence_type, smoking_status
3 Numerical Attributes : age, avg_glucose_level, bmi
unique(StrokeData$gender)
## [1] "Male" "Female" "Other"
unique(StrokeData$ever_married)
## [1] "Yes" "No"
unique(StrokeData$work_type)
## [1] "Private" "Self-employed" "Govt_job" "children"
## [5] "Never_worked"
unique(StrokeData$Residence_type)
## [1] "Urban" "Rural"
unique(StrokeData$smoking_status)
## [1] "formerly smoked" "never smoked" "smokes" "Unknown"
First, change the character attributes into factor, because by looking at its unique value, I conclude that those character attributes are categorical.
StrokeData[,c(2,6:8,11)] <- lapply(StrokeData[,c(2,6:8,11)],as.factor)
#str(StrokeData)
bmi has 201 missing value from 5110 data observations(3.93%), which is small portion and we will remove them from the dataset to make a better logistic regression fit.
StrokeData <- na.omit(StrokeData)
str(StrokeData)
## 'data.frame': 4909 obs. of 12 variables:
## $ id : int 9046 31112 60182 1665 56669 53882 10434 60491 12109 12095 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 2 1 1 2 2 1 1 1 1 ...
## $ age : num 67 80 49 79 81 74 69 78 81 61 ...
## $ hypertension : int 0 0 0 1 0 1 0 0 1 0 ...
## $ heart_disease : int 1 1 0 0 0 1 0 0 0 1 ...
## $ ever_married : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 4 4 5 4 4 4 4 4 2 ...
## $ Residence_type : Factor w/ 2 levels "Rural","Urban": 2 1 2 1 2 1 2 2 1 1 ...
## $ avg_glucose_level: num 229 106 171 174 186 ...
## $ bmi : num 36.6 32.5 34.4 24 29 27.4 22.8 24.2 29.7 36.8 ...
## $ smoking_status : Factor w/ 4 levels "formerly smoked",..: 1 2 3 2 1 2 2 4 2 3 ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:201] 2 9 14 20 28 30 44 47 51 52 ...
## ..- attr(*, "names")= chr [1:201] "2" "9" "14" "20" ...
#colSums(is.na(StrokeData))
Now, there are 4909 after omitting the N/A value of bmi on the dataset
Check if there is any duplicate data on the dataset
print(sum(duplicated(StrokeData)))
## [1] 0
There is no duplicate data in this dataset. Now, we will make some plot to see relations between attributes.
Univariate Analysis
plot1 <- ggplot(StrokeData,aes(x=bmi))+geom_histogram(fill='indianred3',col='white') + labs(title="bmi distribution on data observations",x='bmi',y='Total')
plot2 <- ggplot(StrokeData,aes(x=gender)) + geom_bar(fill='lightgreen',col='white')+labs(title='Gender Status on data observations',x='Stroke Status',y='Total')
plot3 <- ggplot(StrokeData, aes(x=avg_glucose_level)) + geom_histogram(fill='indianred3',col='white')+ labs(title='Average Glucose Level Distribution', x="avg_glucose_lvl",y='Total')
plot4 <- ggplot(StrokeData, aes(x=Residence_type))+ geom_bar(fill='lightgreen',col='white')+labs(title='Residence Type Barplot',x='Residence Type',y='Total')
grid.arrange(plot1,plot2,plot3,plot4,nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Explanation:
Bivariate Analysis
plot1<-ggplot(StrokeData,aes(x=as.factor(hypertension),fill= as.factor(stroke)))+geom_bar(position='fill')+labs(title="Hypertension Status vs Stroke Status",x='Hypertension')
plot2<-ggplot(StrokeData, aes(as.factor(stroke),avg_glucose_level))+geom_violin(fill='lightgreen') + labs(title="Stroke Status vs Avg_Glucose_Level")
plot3<-ggplot(StrokeData, aes(x=ever_married,fill=as.factor(stroke)))+geom_bar(position="fill",las=0)+labs(title="Ever Married vs Stroke Status",x='Ever Married')
plot4<-ggplot(StrokeData, aes(as.factor(stroke),age))+geom_boxplot(fill='#58D68D',outlier.color='orange')+labs(title='Age vs Stroke Status')
grid.arrange(plot1,plot2,plot3,plot4,nrow=2)
Explanation:
Multivariate Analysis
We will make a correlation plot between numerical variable to check multicollinearity (relation between independent variable)
DfNumerical <-StrokeData[c(3,9,10,12)]
Corr <- cor(DfNumerical, method = 'pearson')
ggcorrplot(Corr,lab=TRUE)+labs(title="Correlation between numerical variable")
Explanation:
Detect Outliers
We will try to detect outliers of bmi attributes using Three Sigma edit rule, Hampel Identifier and Boxplot
#Create divided plot of 1 row and 3 columns
par(mfrow=c(1,3))
#Three Sigma Rule - Data X in dataset y considered outlier between | X - mean(y) | > 3std(y), data that doesn't fulfill that considered as normal
#Normal Data Interval = (Mean - 3std) <= Mean <= (Mean+3Std)
#ylim is setted to show the lower limit line
plot(StrokeData$bmi,xlab='Record Number',ylab='bmi',main='Three Sigma Edit Rule',col='#2E8B57',ylim=c(-10,100),cex.lab=1.5,cex.main=1.5,font.lab=3,font.main=4)
#Line for mean of Data
abline(h=mean(StrokeData$bmi),lty='dashed',col='red',lwd=2)
#Line for upeer limit and lower limit of data
abline(h=mean(StrokeData$bmi)+3*(sd(StrokeData$bmi)),lty='dashed',col='blue',lwd=2)
abline(h=mean(StrokeData$bmi)-3*(sd(StrokeData$bmi)),lty='dashed',col='blue',lwd=2)
legend('topleft',legend=c("Mean","Upper/Lower Lim"),col=c("red",'blue'),lty=2,lwd=1.5)
#Hampel Identifier - Data X in dataset y considered outlier between | X - median(y) | > 3MADM(y), data that doesn't fulfill that considered as normal
#Normal Data Interval = (Median - 3MADM) <= Median <= (Median+3MADM)
plot(StrokeData$bmi,xlab='Record Number',ylab='bmi',main='Hampel Identifier',col='#9ACD32',ylim=c(-10,100),cex.lab=1.5,cex.main=1.5,font.lab=3,font.main=4)
#Line for median of Data
abline(h = median(StrokeData$bmi),lty='dashed',col='red',lwd=2)
#Line for Upper limit and lower limit of data
abline(h=median(StrokeData$bmi)+3*(mad(StrokeData$bmi)),lty='dashed',col='blue',lwd=2)
abline(h=median(StrokeData$bmi)-3*(mad(StrokeData$bmi)),lty='dashed',col='blue',lwd=2)
legend('topleft',legend=c("Median","Upper/Lower Lim"),col=c("red",'blue'),lty=2,lwd=1.5)
#Boxplot Rule = Data x in dataset y Considered outlier when x is bigger than Upper Quartile(quantile(0.75))+1.5IQR or x is smaller than Lower Quartile(quantile(0.75))-1.5IQR
#Normal Data Interval = quantile(0.25)-1.5IQR <= quantile(0.5)/median <= quantile(0.75)+1.5IQR
boxplot(StrokeData$bmi,ylab='bmi',main='Boxplot Rule',col='#9ACD32',ylim=c(-10,100),cex.lab=1.5,cex.main=1.5,font.lab=3,font.main=4)
bmi <- StrokeData$bmi
OutliersThreeSigma <- sum(abs(bmi-mean(bmi))>(3*sd(bmi)))
OutliersHampel <- sum(abs(bmi-median(bmi)) > (3*mad(bmi)))
UpperOutlierBoxplot = sum(bmi > quantile(bmi,0.75) + 1.5*IQR(bmi))
LowerOutlierBoxplot = sum(bmi < quantile(bmi,0.25) - 1.5*IQR(bmi))
OutlierBoxplot = UpperOutlierBoxplot + LowerOutlierBoxplot
sprintf("Total Outliers using Three Sigma Rule : %d",OutliersThreeSigma)
## [1] "Total Outliers using Three Sigma Rule : 58"
sprintf("Total Outliers using Hampel Identifier : %d",OutliersHampel)
## [1] "Total Outliers using Hampel Identifier : 90"
sprintf("Total Outliers using Boxplot Rule : %d",OutlierBoxplot)
## [1] "Total Outliers using Boxplot Rule : 110"
As we could see at the univariate analysis before, the bmi has a roughly normal distribution. So, the mean and median don’t have so much difference, even it is nearly the same.So, by looking at the plot and the sum, we can say that Boxplot Rule gives the most outlier detection by having 110 outliers.
The best outlier detector for me is using Hampel Identifier / Three Sigma rule, because it doesn’t detect too much outlier and effective because the bmi attributes is normally distributed.
B. Data Preparation and the most important features
1. Because ID is just an identifier and doesn’t have any meaning, we will drop the ID column from the dataset
StrokeData <- StrokeData[-c(1)]
head(StrokeData)
## gender age hypertension heart_disease ever_married work_type
## 1 Male 67 0 1 Yes Private
## 3 Male 80 0 1 Yes Private
## 4 Female 49 0 0 Yes Private
## 5 Female 79 1 0 Yes Self-employed
## 6 Male 81 0 0 Yes Private
## 7 Male 74 1 1 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24.0 never smoked 1
## 6 Urban 186.21 29.0 formerly smoked 1
## 7 Rural 70.09 27.4 never smoked 1
sum(StrokeData$gender=='Other')
## [1] 1
sum(StrokeData$smoking_status=='Unknown')
## [1] 1483
Now, because there is only 1 Other gender in this dataset, we could drop the levels from gender the dataset and remove the data that has Other gender.
But, the Unknown smoking status has a massive amount of data (30.2%). So, I decide to just make it stay like that instead of removing it
#Remove Other Gender
StrokeData <- subset(StrokeData, gender!='Other')
StrokeData$gender <- droplevels(StrokeData$gender)
str(StrokeData)
## 'data.frame': 4908 obs. of 11 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 1 1 ...
## $ age : num 67 80 49 79 81 74 69 78 81 61 ...
## $ hypertension : int 0 0 0 1 0 1 0 0 1 0 ...
## $ heart_disease : int 1 1 0 0 0 1 0 0 0 1 ...
## $ ever_married : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 4 4 5 4 4 4 4 4 2 ...
## $ Residence_type : Factor w/ 2 levels "Rural","Urban": 2 1 2 1 2 1 2 2 1 1 ...
## $ avg_glucose_level: num 229 106 171 174 186 ...
## $ bmi : num 36.6 32.5 34.4 24 29 27.4 22.8 24.2 29.7 36.8 ...
## $ smoking_status : Factor w/ 4 levels "formerly smoked",..: 1 2 3 2 1 2 2 4 2 3 ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
To make it all even, I want to change hypertension, heart_disease, and stroke to factor variable, because the variable is binary category that contains either 1 or 0.
StrokeData[StrokeData$hypertension==0,]$hypertension <- "No"
StrokeData[StrokeData$hypertension==1,]$hypertension <- "Yes"
StrokeData[StrokeData$heart_disease==0,]$heart_disease <- "No"
StrokeData[StrokeData$heart_disease==1,]$heart_disease <- "Yes"
StrokeData[StrokeData$stroke==0,]$stroke <- "No"
StrokeData[StrokeData$stroke==1,]$stroke <- "Yes"
StrokeData[,c(3,4,11)] <- lapply(StrokeData[,c(3,4,11)],as.factor)
str(StrokeData)
## 'data.frame': 4908 obs. of 11 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 1 1 ...
## $ age : num 67 80 49 79 81 74 69 78 81 61 ...
## $ hypertension : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 2 1 1 2 1 ...
## $ heart_disease : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 1 1 1 2 ...
## $ ever_married : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 4 4 5 4 4 4 4 4 2 ...
## $ Residence_type : Factor w/ 2 levels "Rural","Urban": 2 1 2 1 2 1 2 2 1 1 ...
## $ avg_glucose_level: num 229 106 171 174 186 ...
## $ bmi : num 36.6 32.5 34.4 24 29 27.4 22.8 24.2 29.7 36.8 ...
## $ smoking_status : Factor w/ 4 levels "formerly smoked",..: 1 2 3 2 1 2 2 4 2 3 ...
## $ stroke : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
Now, split the dataset into training and testing data with distribution of 80% and 20%
set.seed(1)
validationindex <- createDataPartition(StrokeData$stroke , p=0.80, list =FALSE)
#Create Validation Set by subtract real Dataset by trainingset
validationset <- StrokeData[-validationindex,]
trainingset<- StrokeData[validationindex,]
Now, after doing data preparation, We can say that stroke features is the most important features , because it is the target variable that we need to make a regression fit. Instead of that, the other independent variable is also important thing to predict the dependent variable more accurately.But, without something to predict, all the independent variable is useless, right?
C. Predictive Modeling (Logistic Regression)
First, we will do a logistic regression using glm() by using all the independent variable
fit1 <- glm(stroke ~.,family = binomial(link = "logit"), data = trainingset)
summary(fit1)
##
## Call:
## glm(formula = stroke ~ ., family = binomial(link = "logit"),
## data = trainingset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1999 -0.2929 -0.1487 -0.0727 3.4386
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.227e+00 1.085e+00 -6.661 2.72e-11 ***
## genderMale 9.946e-04 1.721e-01 0.006 0.9954
## age 7.813e-02 7.254e-03 10.772 < 2e-16 ***
## hypertensionYes 4.569e-01 1.968e-01 2.321 0.0203 *
## heart_diseaseYes 2.828e-01 2.328e-01 1.215 0.2244
## ever_marriedYes -1.627e-01 2.744e-01 -0.593 0.5532
## work_typeGovt_job -1.045e+00 1.144e+00 -0.913 0.3611
## work_typeNever_worked -1.109e+01 5.472e+02 -0.020 0.9838
## work_typePrivate -9.711e-01 1.130e+00 -0.860 0.3899
## work_typeSelf-employed -1.336e+00 1.151e+00 -1.160 0.2459
## Residence_typeUrban -1.031e-01 1.674e-01 -0.616 0.5381
## avg_glucose_level 4.304e-03 1.445e-03 2.979 0.0029 **
## bmi 6.171e-03 1.344e-02 0.459 0.6460
## smoking_statusnever smoked 4.652e-02 2.122e-01 0.219 0.8264
## smoking_statussmokes 4.557e-01 2.564e-01 1.778 0.0755 .
## smoking_statusUnknown -2.161e-01 2.792e-01 -0.774 0.4389
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1387.8 on 3927 degrees of freedom
## Residual deviance: 1090.2 on 3912 degrees of freedom
## AIC: 1122.2
##
## Number of Fisher Scoring iterations: 15
After looking at the relation between the independent variable with dependent variable, I exclude the weak correlated independent variable and the remain is age, hypertension and avg_glucose_level
fit2 <- glm(stroke ~age+hypertension+avg_glucose_level,family = binomial(link = "logit"), data = trainingset)
summary(fit2)
##
## Call:
## glm(formula = stroke ~ age + hypertension + avg_glucose_level,
## family = binomial(link = "logit"), data = trainingset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0115 -0.2976 -0.1571 -0.0725 3.6286
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.856184 0.438226 -17.927 < 2e-16 ***
## age 0.072011 0.006217 11.583 < 2e-16 ***
## hypertensionYes 0.504689 0.193934 2.602 0.009258 **
## avg_glucose_level 0.004589 0.001392 3.297 0.000977 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1387.8 on 3927 degrees of freedom
## Residual deviance: 1102.3 on 3924 degrees of freedom
## AIC: 1110.3
##
## Number of Fisher Scoring iterations: 7
Now, the explanation of fit2:
Value of residuals is roughly centered at 0, which is good for a regression fit.
All the remained independent variable has a strong correlation with the dependent variable
AIC is reduced from 1122.2 to 1110.3, which is good to have a fit with lower AIC
Because the fit is good enough, I will make fit2 as my final model
Then, with this fit2, we will do prediction to validationset and evaluate the model
Prediction and Evaluation using ROC Curve + AUC
predictionLogistic <- predict(fit2, newdata = subset(validationset, select = c(1:10)), type = "response")
evaluation <- prediction(predictionLogistic, validationset$stroke)
prf <-performance(evaluation, measure = "tpr", x.measure = "fpr")
plot(prf)
After using evaluation using True Positive Rate and False Positive Rate to create ROC Curve , it shown that the data has a normal graph and has a linear relation between tpr and fpr
auc <- performance(evaluation, measure="auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8318917
0.83 Value of AUC is a good sign that our model performed well. Because, a good AUC value is closer to 1 that shows that it was a perfect fit
result <- ifelse(predictionLogistic>0.5, "Yes", "No")
misclassificationError <- mean(result != validationset$stroke)
print(paste("Accuracy : ", 1-misclassificationError))
## [1] "Accuracy : 0.958163265306123"
After count the accuracy, it shown that the data has accuracy of 95,82% .
Decision Tree
create decision tree model using rpart() using all the dependent variable and using cp(complexity parameter) 0 to reduce complexity of the data
modelDT <- rpart(stroke ~ . , data = trainingset, method = "class",cp=0)
modelDT
## n= 3928
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3928 168 No (0.95723014 0.04276986)
## 2) age< 67.5 3292 67 No (0.97964763 0.02035237) *
## 3) age>=67.5 636 101 No (0.84119497 0.15880503)
## 6) avg_glucose_level< 126.84 430 56 No (0.86976744 0.13023256)
## 12) age< 79.5 327 33 No (0.89908257 0.10091743) *
## 13) age>=79.5 103 23 No (0.77669903 0.22330097)
## 26) avg_glucose_level< 64.295 15 0 No (1.00000000 0.00000000) *
## 27) avg_glucose_level>=64.295 88 23 No (0.73863636 0.26136364)
## 54) avg_glucose_level>=70.305 80 17 No (0.78750000 0.21250000)
## 108) smoking_status=formerly smoked,smokes,Unknown 41 4 No (0.90243902 0.09756098) *
## 109) smoking_status=never smoked 39 13 No (0.66666667 0.33333333)
## 218) bmi>=23.6 32 8 No (0.75000000 0.25000000) *
## 219) bmi< 23.6 7 2 Yes (0.28571429 0.71428571) *
## 55) avg_glucose_level< 70.305 8 2 Yes (0.25000000 0.75000000) *
## 7) avg_glucose_level>=126.84 206 45 No (0.78155340 0.21844660)
## 14) avg_glucose_level>=135.06 196 40 No (0.79591837 0.20408163)
## 28) work_type=Self-employed 76 10 No (0.86842105 0.13157895) *
## 29) work_type=Govt_job,Private 120 30 No (0.75000000 0.25000000)
## 58) avg_glucose_level< 219.625 79 14 No (0.82278481 0.17721519) *
## 59) avg_glucose_level>=219.625 41 16 No (0.60975610 0.39024390)
## 118) bmi< 31.05 24 6 No (0.75000000 0.25000000) *
## 119) bmi>=31.05 17 7 Yes (0.41176471 0.58823529) *
## 15) avg_glucose_level< 135.06 10 5 No (0.50000000 0.50000000) *
Then , we make a plot of the decision tree above
rpart.plot(modelDT,extra=1)
It would be complex if we explain all of them, so I just gonna explain the “Yes” Stroke Status
People that got stroke has a characteristic of
Check the importance of each variable
modelDT$variable.importance
## age avg_glucose_level bmi smoking_status
## 23.3912176 12.7897086 5.7619755 2.3560898
## work_type heart_disease Residence_type hypertension
## 1.4759833 1.2209688 0.5357414 0.1709350
Age and avg_glucose_level is the most importance data in the Decision Tree
Predict and Evaluate using Decision Tree
predictionDT<-predict(modelDT, validationset, type="class")
cm<-table(predictionDT, validationset$stroke)
cm
##
## predictionDT No Yes
## No 934 38
## Yes 5 3
Now, after create the table correlation matrix, we will count the accuracy of the model
#overall accuracy
sum(diag(cm)) / sum(cm)
## [1] 0.9561224
#misclassification accuracy
1- sum(diag(cm)) / sum(cm)
## [1] 0.04387755
The accuracy of the Decision Tree model is 95,61% and error miscalculate is around 4,39%