Recording Link : 2501961994_FinalExam.mkv Import Library Dataset

options(warn=-1)
library(ROCR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggcorrplot)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(rpart.plot)
#Import Dataset (Even NIM Numbers -> StrokeData.csv)
StrokeData <- read.csv("StrokeData.csv",na.strings=c("N/A"))
head(StrokeData)
##      id gender age hypertension heart_disease ever_married     work_type
## 1  9046   Male  67            0             1          Yes       Private
## 2 51676 Female  61            0             0          Yes Self-employed
## 3 31112   Male  80            0             1          Yes       Private
## 4 60182 Female  49            0             0          Yes       Private
## 5  1665 Female  79            1             0          Yes Self-employed
## 6 56669   Male  81            0             0          Yes       Private
##   Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1          Urban            228.69 36.6 formerly smoked      1
## 2          Rural            202.21   NA    never smoked      1
## 3          Rural            105.92 32.5    never smoked      1
## 4          Urban            171.23 34.4          smokes      1
## 5          Rural            174.12 24.0    never smoked      1
## 6          Urban            186.21 29.0 formerly smoked      1

A. Performing EDA

summary(StrokeData)
##        id           gender               age         hypertension    
##  Min.   :   67   Length:5110        Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Class :character   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Mode  :character   Median :45.00   Median :0.00000  
##  Mean   :36518                      Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                      3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                      Max.   :82.00   Max.   :1.00000  
##                                                                      
##  heart_disease     ever_married        work_type         Residence_type    
##  Min.   :0.00000   Length:5110        Length:5110        Length:5110       
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05401                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  avg_glucose_level      bmi        smoking_status         stroke       
##  Min.   : 55.12    Min.   :10.30   Length:5110        Min.   :0.00000  
##  1st Qu.: 77.25    1st Qu.:23.50   Class :character   1st Qu.:0.00000  
##  Median : 91.89    Median :28.10   Mode  :character   Median :0.00000  
##  Mean   :106.15    Mean   :28.89                      Mean   :0.04873  
##  3rd Qu.:114.09    3rd Qu.:33.10                      3rd Qu.:0.00000  
##  Max.   :271.74    Max.   :97.60                      Max.   :1.00000  
##                    NA's   :201
colSums(is.na(StrokeData))
##                id            gender               age      hypertension 
##                 0                 0                 0                 0 
##     heart_disease      ever_married         work_type    Residence_type 
##                 0                 0                 0                 0 
## avg_glucose_level               bmi    smoking_status            stroke 
##                 0               201                 0                 0

There are 201 missing value bmi variable.

str(StrokeData)
## 'data.frame':    5110 obs. of  12 variables:
##  $ id               : int  9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
##  $ gender           : chr  "Male" "Female" "Male" "Female" ...
##  $ age              : num  67 61 80 49 79 81 74 69 59 78 ...
##  $ hypertension     : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ heart_disease    : int  1 0 1 0 0 0 1 0 0 0 ...
##  $ ever_married     : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ work_type        : chr  "Private" "Self-employed" "Private" "Private" ...
##  $ Residence_type   : chr  "Urban" "Rural" "Rural" "Urban" ...
##  $ avg_glucose_level: num  229 202 106 171 174 ...
##  $ bmi              : num  36.6 NA 32.5 34.4 24 29 27.4 22.8 NA 24.2 ...
##  $ smoking_status   : chr  "formerly smoked" "never smoked" "never smoked" "smokes" ...
##  $ stroke           : int  1 1 1 1 1 1 1 1 1 1 ...

There are 5110 data observations and 12 attributes in this dataset.

Then, there are:

4 Integer Attributes : id, hypertension, heart_disease, stroke

5 Character Attributes : gender, ever_married, work_type, Residence_type, smoking_status

3 Numerical Attributes : age, avg_glucose_level, bmi

unique(StrokeData$gender)
## [1] "Male"   "Female" "Other"
unique(StrokeData$ever_married)
## [1] "Yes" "No"
unique(StrokeData$work_type)
## [1] "Private"       "Self-employed" "Govt_job"      "children"     
## [5] "Never_worked"
unique(StrokeData$Residence_type)
## [1] "Urban" "Rural"
unique(StrokeData$smoking_status)
## [1] "formerly smoked" "never smoked"    "smokes"          "Unknown"

First, change the character attributes into factor, because by looking at its unique value, I conclude that those character attributes are categorical.

StrokeData[,c(2,6:8,11)] <- lapply(StrokeData[,c(2,6:8,11)],as.factor)
#str(StrokeData)

bmi has 201 missing value from 5110 data observations(3.93%), which is small portion and we will remove them from the dataset to make a better logistic regression fit.

StrokeData <- na.omit(StrokeData)
str(StrokeData)
## 'data.frame':    4909 obs. of  12 variables:
##  $ id               : int  9046 31112 60182 1665 56669 53882 10434 60491 12109 12095 ...
##  $ gender           : Factor w/ 3 levels "Female","Male",..: 2 2 1 1 2 2 1 1 1 1 ...
##  $ age              : num  67 80 49 79 81 74 69 78 81 61 ...
##  $ hypertension     : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ heart_disease    : int  1 1 0 0 0 1 0 0 0 1 ...
##  $ ever_married     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
##  $ work_type        : Factor w/ 5 levels "children","Govt_job",..: 4 4 4 5 4 4 4 4 4 2 ...
##  $ Residence_type   : Factor w/ 2 levels "Rural","Urban": 2 1 2 1 2 1 2 2 1 1 ...
##  $ avg_glucose_level: num  229 106 171 174 186 ...
##  $ bmi              : num  36.6 32.5 34.4 24 29 27.4 22.8 24.2 29.7 36.8 ...
##  $ smoking_status   : Factor w/ 4 levels "formerly smoked",..: 1 2 3 2 1 2 2 4 2 3 ...
##  $ stroke           : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:201] 2 9 14 20 28 30 44 47 51 52 ...
##   ..- attr(*, "names")= chr [1:201] "2" "9" "14" "20" ...
#colSums(is.na(StrokeData))

Now, there are 4909 after omitting the N/A value of bmi on the dataset

Check if there is any duplicate data on the dataset

print(sum(duplicated(StrokeData)))
## [1] 0

There is no duplicate data in this dataset. Now, we will make some plot to see relations between attributes.

Univariate Analysis

plot1 <- ggplot(StrokeData,aes(x=bmi))+geom_histogram(fill='indianred3',col='white') + labs(title="bmi distribution on data observations",x='bmi',y='Total')

plot2 <- ggplot(StrokeData,aes(x=gender)) + geom_bar(fill='lightgreen',col='white')+labs(title='Gender Status on data observations',x='Stroke Status',y='Total')

plot3 <- ggplot(StrokeData, aes(x=avg_glucose_level)) + geom_histogram(fill='indianred3',col='white')+ labs(title='Average Glucose Level Distribution', x="avg_glucose_lvl",y='Total')

plot4 <- ggplot(StrokeData, aes(x=Residence_type))+ geom_bar(fill='lightgreen',col='white')+labs(title='Residence Type Barplot',x='Residence Type',y='Total')

grid.arrange(plot1,plot2,plot3,plot4,nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Explanation:

  1. Both Glucose and bmi has a roughly normal distribution.
  2. There are around 2800-2900 Female and 1800-1900 Male in this dataset, shown that there are more female data observations than male data observations
  3. There are roughly even amount of People who lives in Rural and Urban residence in this dataset.

Bivariate Analysis

plot1<-ggplot(StrokeData,aes(x=as.factor(hypertension),fill= as.factor(stroke)))+geom_bar(position='fill')+labs(title="Hypertension Status vs Stroke Status",x='Hypertension')

plot2<-ggplot(StrokeData, aes(as.factor(stroke),avg_glucose_level))+geom_violin(fill='lightgreen') + labs(title="Stroke Status vs Avg_Glucose_Level")

plot3<-ggplot(StrokeData, aes(x=ever_married,fill=as.factor(stroke)))+geom_bar(position="fill",las=0)+labs(title="Ever Married vs Stroke Status",x='Ever Married')

plot4<-ggplot(StrokeData, aes(as.factor(stroke),age))+geom_boxplot(fill='#58D68D',outlier.color='orange')+labs(title='Age vs Stroke Status')

grid.arrange(plot1,plot2,plot3,plot4,nrow=2)

Explanation:

  1. People that has hypertension more likely to get stroke than people who doesn’t have it.
  2. The more average glucose level , the more likely people will get stroke
  3. People that has ever married has slight chance to get stroke than people who doesn’t
  4. People will likely to get stroke when they become older, in this data the average age of people who get stroke is in range between 60-80 years old

Multivariate Analysis

We will make a correlation plot between numerical variable to check multicollinearity (relation between independent variable)

DfNumerical <-StrokeData[c(3,9,10,12)]
Corr <- cor(DfNumerical, method = 'pearson')
ggcorrplot(Corr,lab=TRUE)+labs(title="Correlation between numerical variable")

Explanation:

  1. There is no strong correlation between independent variable, so there is no multicollinearity happened in this dataset
  2. The relation between numerical independent and the predictor variable(dependent variable) is very weak.

Detect Outliers

We will try to detect outliers of bmi attributes using Three Sigma edit rule, Hampel Identifier and Boxplot

#Create divided plot of 1 row and 3 columns
par(mfrow=c(1,3))


#Three Sigma Rule - Data X in dataset y considered outlier between | X - mean(y) |  >  3std(y), data that doesn't fulfill that considered as normal
#Normal Data Interval = (Mean - 3std) <= Mean <= (Mean+3Std)

#ylim is setted to show the lower limit line
plot(StrokeData$bmi,xlab='Record Number',ylab='bmi',main='Three Sigma Edit Rule',col='#2E8B57',ylim=c(-10,100),cex.lab=1.5,cex.main=1.5,font.lab=3,font.main=4)
#Line for mean of Data
abline(h=mean(StrokeData$bmi),lty='dashed',col='red',lwd=2)
#Line for upeer limit and lower limit of data
abline(h=mean(StrokeData$bmi)+3*(sd(StrokeData$bmi)),lty='dashed',col='blue',lwd=2)
abline(h=mean(StrokeData$bmi)-3*(sd(StrokeData$bmi)),lty='dashed',col='blue',lwd=2)
legend('topleft',legend=c("Mean","Upper/Lower Lim"),col=c("red",'blue'),lty=2,lwd=1.5)

#Hampel Identifier - Data X in dataset y considered outlier between | X - median(y) |  >  3MADM(y), data that doesn't fulfill that considered as normal
#Normal Data Interval = (Median - 3MADM) <= Median <= (Median+3MADM)

plot(StrokeData$bmi,xlab='Record Number',ylab='bmi',main='Hampel Identifier',col='#9ACD32',ylim=c(-10,100),cex.lab=1.5,cex.main=1.5,font.lab=3,font.main=4)
#Line for median of Data
abline(h = median(StrokeData$bmi),lty='dashed',col='red',lwd=2)
#Line for Upper limit and lower limit of data
abline(h=median(StrokeData$bmi)+3*(mad(StrokeData$bmi)),lty='dashed',col='blue',lwd=2)
abline(h=median(StrokeData$bmi)-3*(mad(StrokeData$bmi)),lty='dashed',col='blue',lwd=2)
legend('topleft',legend=c("Median","Upper/Lower Lim"),col=c("red",'blue'),lty=2,lwd=1.5)

#Boxplot Rule = Data x in dataset y Considered outlier when x is bigger than Upper Quartile(quantile(0.75))+1.5IQR or x is smaller than Lower Quartile(quantile(0.75))-1.5IQR
#Normal Data Interval = quantile(0.25)-1.5IQR <= quantile(0.5)/median <= quantile(0.75)+1.5IQR
boxplot(StrokeData$bmi,ylab='bmi',main='Boxplot Rule',col='#9ACD32',ylim=c(-10,100),cex.lab=1.5,cex.main=1.5,font.lab=3,font.main=4)

bmi <- StrokeData$bmi
OutliersThreeSigma <- sum(abs(bmi-mean(bmi))>(3*sd(bmi)))
OutliersHampel <- sum(abs(bmi-median(bmi)) > (3*mad(bmi)))

UpperOutlierBoxplot = sum(bmi > quantile(bmi,0.75) + 1.5*IQR(bmi))
LowerOutlierBoxplot = sum(bmi < quantile(bmi,0.25) - 1.5*IQR(bmi))

OutlierBoxplot = UpperOutlierBoxplot + LowerOutlierBoxplot

sprintf("Total Outliers using Three Sigma Rule : %d",OutliersThreeSigma)
## [1] "Total Outliers using Three Sigma Rule : 58"
sprintf("Total Outliers using Hampel Identifier : %d",OutliersHampel)
## [1] "Total Outliers using Hampel Identifier : 90"
sprintf("Total Outliers using Boxplot Rule : %d",OutlierBoxplot)
## [1] "Total Outliers using Boxplot Rule : 110"

As we could see at the univariate analysis before, the bmi has a roughly normal distribution. So, the mean and median don’t have so much difference, even it is nearly the same.So, by looking at the plot and the sum, we can say that Boxplot Rule gives the most outlier detection by having 110 outliers.

The best outlier detector for me is using Hampel Identifier / Three Sigma rule, because it doesn’t detect too much outlier and effective because the bmi attributes is normally distributed.

B. Data Preparation and the most important features

1. Because ID is just an identifier and doesn’t have any meaning, we will drop the ID column from the dataset

StrokeData <- StrokeData[-c(1)]
head(StrokeData)
##   gender age hypertension heart_disease ever_married     work_type
## 1   Male  67            0             1          Yes       Private
## 3   Male  80            0             1          Yes       Private
## 4 Female  49            0             0          Yes       Private
## 5 Female  79            1             0          Yes Self-employed
## 6   Male  81            0             0          Yes       Private
## 7   Male  74            1             1          Yes       Private
##   Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1          Urban            228.69 36.6 formerly smoked      1
## 3          Rural            105.92 32.5    never smoked      1
## 4          Urban            171.23 34.4          smokes      1
## 5          Rural            174.12 24.0    never smoked      1
## 6          Urban            186.21 29.0 formerly smoked      1
## 7          Rural             70.09 27.4    never smoked      1
  1. Check the “weird” levels in factor data , which is “Other” in gender attributes and “Unknown” in smoking_status
sum(StrokeData$gender=='Other')
## [1] 1
sum(StrokeData$smoking_status=='Unknown')
## [1] 1483

Now, because there is only 1 Other gender in this dataset, we could drop the levels from gender the dataset and remove the data that has Other gender.

But, the Unknown smoking status has a massive amount of data (30.2%). So, I decide to just make it stay like that instead of removing it

#Remove Other Gender
StrokeData <- subset(StrokeData, gender!='Other')
StrokeData$gender <- droplevels(StrokeData$gender)
str(StrokeData)
## 'data.frame':    4908 obs. of  11 variables:
##  $ gender           : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 1 1 ...
##  $ age              : num  67 80 49 79 81 74 69 78 81 61 ...
##  $ hypertension     : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ heart_disease    : int  1 1 0 0 0 1 0 0 0 1 ...
##  $ ever_married     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
##  $ work_type        : Factor w/ 5 levels "children","Govt_job",..: 4 4 4 5 4 4 4 4 4 2 ...
##  $ Residence_type   : Factor w/ 2 levels "Rural","Urban": 2 1 2 1 2 1 2 2 1 1 ...
##  $ avg_glucose_level: num  229 106 171 174 186 ...
##  $ bmi              : num  36.6 32.5 34.4 24 29 27.4 22.8 24.2 29.7 36.8 ...
##  $ smoking_status   : Factor w/ 4 levels "formerly smoked",..: 1 2 3 2 1 2 2 4 2 3 ...
##  $ stroke           : int  1 1 1 1 1 1 1 1 1 1 ...

To make it all even, I want to change hypertension, heart_disease, and stroke to factor variable, because the variable is binary category that contains either 1 or 0.

StrokeData[StrokeData$hypertension==0,]$hypertension <- "No"
StrokeData[StrokeData$hypertension==1,]$hypertension <- "Yes"

StrokeData[StrokeData$heart_disease==0,]$heart_disease <- "No"
StrokeData[StrokeData$heart_disease==1,]$heart_disease <- "Yes"

StrokeData[StrokeData$stroke==0,]$stroke <- "No"
StrokeData[StrokeData$stroke==1,]$stroke <- "Yes"

StrokeData[,c(3,4,11)] <- lapply(StrokeData[,c(3,4,11)],as.factor)


str(StrokeData)
## 'data.frame':    4908 obs. of  11 variables:
##  $ gender           : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 1 1 ...
##  $ age              : num  67 80 49 79 81 74 69 78 81 61 ...
##  $ hypertension     : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 2 1 1 2 1 ...
##  $ heart_disease    : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 1 1 1 2 ...
##  $ ever_married     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
##  $ work_type        : Factor w/ 5 levels "children","Govt_job",..: 4 4 4 5 4 4 4 4 4 2 ...
##  $ Residence_type   : Factor w/ 2 levels "Rural","Urban": 2 1 2 1 2 1 2 2 1 1 ...
##  $ avg_glucose_level: num  229 106 171 174 186 ...
##  $ bmi              : num  36.6 32.5 34.4 24 29 27.4 22.8 24.2 29.7 36.8 ...
##  $ smoking_status   : Factor w/ 4 levels "formerly smoked",..: 1 2 3 2 1 2 2 4 2 3 ...
##  $ stroke           : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...

Now, split the dataset into training and testing data with distribution of 80% and 20%

set.seed(1)
validationindex <- createDataPartition(StrokeData$stroke , p=0.80, list =FALSE)

#Create Validation Set by subtract real Dataset by trainingset
validationset <- StrokeData[-validationindex,]
trainingset<- StrokeData[validationindex,]

Now, after doing data preparation, We can say that stroke features is the most important features , because it is the target variable that we need to make a regression fit. Instead of that, the other independent variable is also important thing to predict the dependent variable more accurately.But, without something to predict, all the independent variable is useless, right?

C. Predictive Modeling (Logistic Regression)

First, we will do a logistic regression using glm() by using all the independent variable

fit1 <- glm(stroke ~.,family = binomial(link = "logit"), data = trainingset)
summary(fit1)
## 
## Call:
## glm(formula = stroke ~ ., family = binomial(link = "logit"), 
##     data = trainingset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1999  -0.2929  -0.1487  -0.0727   3.4386  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -7.227e+00  1.085e+00  -6.661 2.72e-11 ***
## genderMale                  9.946e-04  1.721e-01   0.006   0.9954    
## age                         7.813e-02  7.254e-03  10.772  < 2e-16 ***
## hypertensionYes             4.569e-01  1.968e-01   2.321   0.0203 *  
## heart_diseaseYes            2.828e-01  2.328e-01   1.215   0.2244    
## ever_marriedYes            -1.627e-01  2.744e-01  -0.593   0.5532    
## work_typeGovt_job          -1.045e+00  1.144e+00  -0.913   0.3611    
## work_typeNever_worked      -1.109e+01  5.472e+02  -0.020   0.9838    
## work_typePrivate           -9.711e-01  1.130e+00  -0.860   0.3899    
## work_typeSelf-employed     -1.336e+00  1.151e+00  -1.160   0.2459    
## Residence_typeUrban        -1.031e-01  1.674e-01  -0.616   0.5381    
## avg_glucose_level           4.304e-03  1.445e-03   2.979   0.0029 ** 
## bmi                         6.171e-03  1.344e-02   0.459   0.6460    
## smoking_statusnever smoked  4.652e-02  2.122e-01   0.219   0.8264    
## smoking_statussmokes        4.557e-01  2.564e-01   1.778   0.0755 .  
## smoking_statusUnknown      -2.161e-01  2.792e-01  -0.774   0.4389    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1387.8  on 3927  degrees of freedom
## Residual deviance: 1090.2  on 3912  degrees of freedom
## AIC: 1122.2
## 
## Number of Fisher Scoring iterations: 15

After looking at the relation between the independent variable with dependent variable, I exclude the weak correlated independent variable and the remain is age, hypertension and avg_glucose_level

fit2 <- glm(stroke ~age+hypertension+avg_glucose_level,family = binomial(link = "logit"), data = trainingset)
summary(fit2)
## 
## Call:
## glm(formula = stroke ~ age + hypertension + avg_glucose_level, 
##     family = binomial(link = "logit"), data = trainingset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0115  -0.2976  -0.1571  -0.0725   3.6286  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -7.856184   0.438226 -17.927  < 2e-16 ***
## age                0.072011   0.006217  11.583  < 2e-16 ***
## hypertensionYes    0.504689   0.193934   2.602 0.009258 ** 
## avg_glucose_level  0.004589   0.001392   3.297 0.000977 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1387.8  on 3927  degrees of freedom
## Residual deviance: 1102.3  on 3924  degrees of freedom
## AIC: 1110.3
## 
## Number of Fisher Scoring iterations: 7

Now, the explanation of fit2:

  1. Value of residuals is roughly centered at 0, which is good for a regression fit.

  2. All the remained independent variable has a strong correlation with the dependent variable

  3. AIC is reduced from 1122.2 to 1110.3, which is good to have a fit with lower AIC

Because the fit is good enough, I will make fit2 as my final model

Then, with this fit2, we will do prediction to validationset and evaluate the model

Prediction and Evaluation using ROC Curve + AUC

predictionLogistic <- predict(fit2, newdata = subset(validationset, select = c(1:10)), type = "response")
evaluation <- prediction(predictionLogistic, validationset$stroke)
prf <-performance(evaluation, measure = "tpr", x.measure = "fpr")
plot(prf)

After using evaluation using True Positive Rate and False Positive Rate to create ROC Curve , it shown that the data has a normal graph and has a linear relation between tpr and fpr

auc <- performance(evaluation, measure="auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8318917

0.83 Value of AUC is a good sign that our model performed well. Because, a good AUC value is closer to 1 that shows that it was a perfect fit

result <- ifelse(predictionLogistic>0.5, "Yes", "No")
misclassificationError <- mean(result != validationset$stroke)
print(paste("Accuracy : ", 1-misclassificationError))
## [1] "Accuracy :  0.958163265306123"

After count the accuracy, it shown that the data has accuracy of 95,82% .

Decision Tree

create decision tree model using rpart() using all the dependent variable and using cp(complexity parameter) 0 to reduce complexity of the data

modelDT <- rpart(stroke ~ . , data = trainingset, method = "class",cp=0)
modelDT
## n= 3928 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 3928 168 No (0.95723014 0.04276986)  
##     2) age< 67.5 3292  67 No (0.97964763 0.02035237) *
##     3) age>=67.5 636 101 No (0.84119497 0.15880503)  
##       6) avg_glucose_level< 126.84 430  56 No (0.86976744 0.13023256)  
##        12) age< 79.5 327  33 No (0.89908257 0.10091743) *
##        13) age>=79.5 103  23 No (0.77669903 0.22330097)  
##          26) avg_glucose_level< 64.295 15   0 No (1.00000000 0.00000000) *
##          27) avg_glucose_level>=64.295 88  23 No (0.73863636 0.26136364)  
##            54) avg_glucose_level>=70.305 80  17 No (0.78750000 0.21250000)  
##             108) smoking_status=formerly smoked,smokes,Unknown 41   4 No (0.90243902 0.09756098) *
##             109) smoking_status=never smoked 39  13 No (0.66666667 0.33333333)  
##               218) bmi>=23.6 32   8 No (0.75000000 0.25000000) *
##               219) bmi< 23.6 7   2 Yes (0.28571429 0.71428571) *
##            55) avg_glucose_level< 70.305 8   2 Yes (0.25000000 0.75000000) *
##       7) avg_glucose_level>=126.84 206  45 No (0.78155340 0.21844660)  
##        14) avg_glucose_level>=135.06 196  40 No (0.79591837 0.20408163)  
##          28) work_type=Self-employed 76  10 No (0.86842105 0.13157895) *
##          29) work_type=Govt_job,Private 120  30 No (0.75000000 0.25000000)  
##            58) avg_glucose_level< 219.625 79  14 No (0.82278481 0.17721519) *
##            59) avg_glucose_level>=219.625 41  16 No (0.60975610 0.39024390)  
##             118) bmi< 31.05 24   6 No (0.75000000 0.25000000) *
##             119) bmi>=31.05 17   7 Yes (0.41176471 0.58823529) *
##        15) avg_glucose_level< 135.06 10   5 No (0.50000000 0.50000000) *

Then , we make a plot of the decision tree above

rpart.plot(modelDT,extra=1)

It would be complex if we explain all of them, so I just gonna explain the “Yes” Stroke Status

People that got stroke has a characteristic of

  1. Age >= 68 +Have avg_glucose_level < 127 + age >=80 + avg_glucose_level >= 84 + avg_glucose_level < 70
  2. Age >= 68 +Have avg_glucose_level < 127 + age >=80 + avg_glucose_level >=84 + avg_glucose_level >=70 + smoking_status != (formerly smoked, smokes, and unknown) + bmi <24
  3. Age >= 68 +Have avg_glucose_level >=127 + avg_glucose_level >=136 + work-type!= self-employeed + avg_glucose_level >=220 + bmi >=31

Check the importance of each variable

modelDT$variable.importance
##               age avg_glucose_level               bmi    smoking_status 
##        23.3912176        12.7897086         5.7619755         2.3560898 
##         work_type     heart_disease    Residence_type      hypertension 
##         1.4759833         1.2209688         0.5357414         0.1709350

Age and avg_glucose_level is the most importance data in the Decision Tree

Predict and Evaluate using Decision Tree

predictionDT<-predict(modelDT, validationset, type="class")
cm<-table(predictionDT, validationset$stroke)
cm
##             
## predictionDT  No Yes
##          No  934  38
##          Yes   5   3

Now, after create the table correlation matrix, we will count the accuracy of the model

#overall accuracy
sum(diag(cm)) / sum(cm)
## [1] 0.9561224
#misclassification accuracy
1- sum(diag(cm)) / sum(cm)
## [1] 0.04387755

The accuracy of the Decision Tree model is 95,61% and error miscalculate is around 4,39%