library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart)
library(rpart.plot)
library(ROCR)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
##1. A. Explanatory Data Analysis (EDA)
#Import Data
DataStroke <- read.csv("C:\\Semester 2\\DMV\\StrokeData.csv")
head(DataStroke)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 202.21 N/A never smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24 never smoked 1
## 6 Urban 186.21 29 formerly smoked 1
#Check Dimension
dim(DataStroke)
## [1] 5110 12
Explanation : 5110 rows and 12 columns
#Check Data Type
sapply(DataStroke, class)
## id gender age hypertension
## "integer" "character" "numeric" "integer"
## heart_disease ever_married work_type Residence_type
## "integer" "character" "character" "character"
## avg_glucose_level bmi smoking_status stroke
## "numeric" "character" "character" "integer"
Explanation : 6 character, 4 integer, and 2 numeric variable #Cek tipe data dari setiap kolom, ada 6 variabel karakter, 4 variabel integer, dan 2 variabel numerik.
#Convert to factor
DataStroke$gender <- as.factor(DataStroke$gender)
DataStroke$ever_married <- as.factor(DataStroke$ever_married)
DataStroke$smoking_status <- as.factor(DataStroke$smoking_status)
DataStroke$Residence_type <- as.factor(DataStroke$Residence_type)
DataStroke$work_type <- as.factor(DataStroke$work_type)
#Desciptive Analysis
describe(DataStroke)
## DataStroke
##
## 12 Variables 5110 Observations
## --------------------------------------------------------------------------------
## id
## n missing distinct Info Mean Gmd .05 .10
## 5110 0 5110 1 36518 24436 3590 6972
## .25 .50 .75 .90 .95
## 17741 36932 54682 65668 69218
##
## lowest : 67 77 84 91 99, highest: 72911 72914 72915 72918 72940
## --------------------------------------------------------------------------------
## gender
## n missing distinct
## 5110 0 3
##
## Value Female Male Other
## Frequency 2994 2115 1
## Proportion 0.586 0.414 0.000
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 5110 0 104 1 43.23 26.03 5 11
## .25 .50 .75 .90 .95
## 25 45 61 75 79
##
## lowest : 0.08 0.16 0.24 0.32 0.40, highest: 78.00 79.00 80.00 81.00 82.00
## --------------------------------------------------------------------------------
## hypertension
## n missing distinct Info Sum Mean Gmd
## 5110 0 2 0.264 498 0.09746 0.176
##
## --------------------------------------------------------------------------------
## heart_disease
## n missing distinct Info Sum Mean Gmd
## 5110 0 2 0.153 276 0.05401 0.1022
##
## --------------------------------------------------------------------------------
## ever_married
## n missing distinct
## 5110 0 2
##
## Value No Yes
## Frequency 1757 3353
## Proportion 0.344 0.656
## --------------------------------------------------------------------------------
## work_type
## n missing distinct
## 5110 0 5
##
## lowest : children Govt_job Never_worked Private Self-employed
## highest: children Govt_job Never_worked Private Self-employed
##
## Value children Govt_job Never_worked Private
## Frequency 687 657 22 2925
## Proportion 0.134 0.129 0.004 0.572
##
## Value Self-employed
## Frequency 819
## Proportion 0.160
## --------------------------------------------------------------------------------
## Residence_type
## n missing distinct
## 5110 0 2
##
## Value Rural Urban
## Frequency 2514 2596
## Proportion 0.492 0.508
## --------------------------------------------------------------------------------
## avg_glucose_level
## n missing distinct Info Mean Gmd .05 .10
## 5110 0 3979 1 106.1 45.38 60.71 65.79
## .25 .50 .75 .90 .95
## 77.24 91.88 114.09 192.18 216.29
##
## lowest : 55.12 55.22 55.23 55.25 55.26, highest: 266.59 267.60 267.61 267.76 271.74
## --------------------------------------------------------------------------------
## bmi
## n missing distinct
## 5110 0 419
##
## lowest : 10.3 11.3 11.5 12 12.3, highest: 71.9 78 92 97.6 N/A
## --------------------------------------------------------------------------------
## smoking_status
## n missing distinct
## 5110 0 4
##
## Value formerly smoked never smoked smokes Unknown
## Frequency 885 1892 789 1544
## Proportion 0.173 0.370 0.154 0.302
## --------------------------------------------------------------------------------
## stroke
## n missing distinct Info Sum Mean Gmd
## 5110 0 2 0.139 249 0.04873 0.09273
##
## --------------------------------------------------------------------------------
Explanation : 1. There is more than 50% with gender female. 2. Mean of patients age is 43. 3. Most of patients does not have hypertension (based on mean). 4. Patients with heart disease is less than patients without heart disease. 5. Patients with married status reach 60%. 6. Most of patients have private work type. 7. There is slightly difference between sum of rural and urban 8. The least glucose level is 55.12 9. Highest bmi is 97.6 10. Most of patients come from people whose never smoked. 11. Most of patients does not have stroke.
#Check Missing Value
colSums(is.na(DataStroke))
## id gender age hypertension
## 0 0 0 0
## heart_disease ever_married work_type Residence_type
## 0 0 0 0
## avg_glucose_level bmi smoking_status stroke
## 0 0 0 0
Explanation : There are no missing value.
colSums(DataStroke == "N/A")
## id gender age hypertension
## 0 0 0 0
## heart_disease ever_married work_type Residence_type
## 0 0 0 0
## avg_glucose_level bmi smoking_status stroke
## 0 201 0 0
Explanation : There is 201 data filled with N/A in column ‘bmi’.
#Check Duplicate
sum(duplicated(DataStroke) == TRUE)
## [1] 0
Explanation : There is no data duplicated.
DataStroke$bmi <- as.numeric(DataStroke$bmi)
## Warning: NAs introduced by coercion
#Check Unique Values
rapply(DataStroke, function(x) length(unique(x)))
## id gender age hypertension
## 5110 3 104 2
## heart_disease ever_married work_type Residence_type
## 2 2 5 2
## avg_glucose_level bmi smoking_status stroke
## 3979 419 4 2
Explanation : 1. All patients have different id. 2. Use rapply to recursively apply all the data.
#Check Outliers
par(mfrow=c(1,3))
out <- boxplot.stats(DataStroke$age)$out
boxplot(DataStroke$age,
ylab = "Age (Year)",
main = "Boxplot of Age"
)
out1 <- boxplot.stats(DataStroke$bmi)$out
boxplot(DataStroke$bmi,
ylab = "",
main = "Boxplot of BMI"
)
out2 <- boxplot.stats(DataStroke$avg_glucose_level)$out
boxplot(DataStroke$avg_glucose_level,
ylab = "Glucose Level (mg/D)",
main = "Boxplot of Average Glucose Level"
)
length(out)
## [1] 0
length(out1)
## [1] 110
length(out2)
## [1] 627
Explanation : 1. There is no outliers in age. 2. There is 110 outliers in bmi, but there is posibility. 3. There is 627 outliers in average glucose level, but there is posibility.
##1. B. Data Preparation #Handle Missing value
DataStroke$id <- NULL
Explanation : All patients have different id, so id considered as meaningless information and we can drop it.
#Replace Missing value
DataStroke$bmi[is.na(DataStroke$bmi)] <- mean(DataStroke$bmi, na.rm = TRUE)
head(DataStroke)
## gender age hypertension heart_disease ever_married work_type
## 1 Male 67 0 1 Yes Private
## 2 Female 61 0 0 Yes Self-employed
## 3 Male 80 0 1 Yes Private
## 4 Female 49 0 0 Yes Private
## 5 Female 79 1 0 Yes Self-employed
## 6 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.60000 formerly smoked 1
## 2 Rural 202.21 28.89324 never smoked 1
## 3 Rural 105.92 32.50000 never smoked 1
## 4 Urban 171.23 34.40000 smokes 1
## 5 Rural 174.12 24.00000 never smoked 1
## 6 Urban 186.21 29.00000 formerly smoked 1
Explanation : 1. There is 201 data filled with N/A, so we use mean that does not change a lot of data.
#Convert variable to numeric (correlation)
DataStroke$gender <- as.numeric(DataStroke$gender)
DataStroke$ever_married <- as.numeric(DataStroke$ever_married)
DataStroke$smoking_status <- as.numeric(DataStroke$smoking_status)
DataStroke$Residence_type <- as.numeric(DataStroke$Residence_type)
DataStroke$work_type <- as.numeric(DataStroke$work_type)
str(DataStroke)
## 'data.frame': 5110 obs. of 11 variables:
## $ gender : num 2 1 2 1 1 2 2 1 1 1 ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : num 2 2 2 2 2 2 2 1 2 2 ...
## $ work_type : num 4 5 4 4 5 4 4 4 4 4 ...
## $ Residence_type : num 2 1 1 2 1 2 1 2 1 2 ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : num 36.6 28.9 32.5 34.4 24 ...
## $ smoking_status : num 1 2 2 3 2 1 2 2 4 4 ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
#Feature Selection There is 3 methods will be used, correlation, linear model and relative importance.
#Check Correlation
library(Hmisc)
newdata <- cor(DataStroke[sapply(DataStroke, is.numeric)])
corr <- rcorr(newdata, type = "spearman")
corr
## gender age hypertension heart_disease ever_married
## gender 1.00 -0.75 -0.34 0.07 -0.74
## age -0.75 1.00 0.72 0.47 0.96
## hypertension -0.34 0.72 1.00 0.54 0.78
## heart_disease 0.07 0.47 0.54 1.00 0.44
## ever_married -0.74 0.96 0.78 0.44 1.00
## work_type -0.75 0.96 0.70 0.42 0.95
## Residence_type -0.38 0.07 -0.33 0.01 -0.03
## avg_glucose_level -0.05 0.55 0.88 0.70 0.61
## bmi -0.72 0.85 0.71 0.27 0.92
## smoking_status 0.77 -0.95 -0.79 -0.42 -0.96
## stroke -0.10 0.50 0.57 0.90 0.43
## work_type Residence_type avg_glucose_level bmi
## gender -0.75 -0.38 -0.05 -0.72
## age 0.96 0.07 0.55 0.85
## hypertension 0.70 -0.33 0.88 0.71
## heart_disease 0.42 0.01 0.70 0.27
## ever_married 0.95 -0.03 0.61 0.92
## work_type 1.00 -0.07 0.54 0.87
## Residence_type -0.07 1.00 -0.30 -0.07
## avg_glucose_level 0.54 -0.30 1.00 0.59
## bmi 0.87 -0.07 0.59 1.00
## smoking_status -0.96 0.02 -0.61 -0.90
## stroke 0.42 0.19 0.67 0.33
## smoking_status stroke
## gender 0.77 -0.10
## age -0.95 0.50
## hypertension -0.79 0.57
## heart_disease -0.42 0.90
## ever_married -0.96 0.43
## work_type -0.96 0.42
## Residence_type 0.02 0.19
## avg_glucose_level -0.61 0.67
## bmi -0.90 0.33
## smoking_status 1.00 -0.47
## stroke -0.47 1.00
##
## n= 11
##
##
## P
## gender age hypertension heart_disease ever_married
## gender 0.0085 0.3118 0.8317 0.0098
## age 0.0085 0.0128 0.1420 0.0000
## hypertension 0.3118 0.0128 0.0890 0.0045
## heart_disease 0.8317 0.1420 0.0890 0.1797
## ever_married 0.0098 0.0000 0.0045 0.1797
## work_type 0.0085 0.0000 0.0165 0.2006 0.0000
## Residence_type 0.2466 0.8317 0.3259 0.9788 0.9366
## avg_glucose_level 0.8944 0.0767 0.0003 0.0165 0.0467
## bmi 0.0128 0.0008 0.0146 0.4171 0.0000
## smoking_status 0.0053 0.0000 0.0037 0.2006 0.0000
## stroke 0.7699 0.1173 0.0655 0.0002 0.1899
## work_type Residence_type avg_glucose_level bmi
## gender 0.0085 0.2466 0.8944 0.0128
## age 0.0000 0.8317 0.0767 0.0008
## hypertension 0.0165 0.3259 0.0003 0.0146
## heart_disease 0.2006 0.9788 0.0165 0.4171
## ever_married 0.0000 0.9366 0.0467 0.0000
## work_type 0.8317 0.0890 0.0005
## Residence_type 0.8317 0.3701 0.8317
## avg_glucose_level 0.0890 0.3701 0.0556
## bmi 0.0005 0.8317 0.0556
## smoking_status 0.0000 0.9577 0.0467 0.0002
## stroke 0.2006 0.5739 0.0233 0.3259
## smoking_status stroke
## gender 0.0053 0.7699
## age 0.0000 0.1173
## hypertension 0.0037 0.0655
## heart_disease 0.2006 0.0002
## ever_married 0.0000 0.1899
## work_type 0.0000 0.2006
## Residence_type 0.9577 0.5739
## avg_glucose_level 0.0467 0.0233
## bmi 0.0002 0.3259
## smoking_status 0.1420
## stroke 0.1420
Explanation : Variable has the high correlation with stroke : 1. Heart_disease 2. Avg_glucose_level 3. Hypertension 4. Age 5. Smoking_status 6. Ever_married
#Use Linear Model
lm = lm(stroke~., data = DataStroke)
summary(lm)
##
## Call:
## lm(formula = stroke ~ ., data = DataStroke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29470 -0.07916 -0.02619 0.00586 1.03843
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.380e-02 2.268e-02 -0.608 0.5430
## gender 3.245e-04 5.913e-03 0.055 0.9562
## age 2.811e-03 2.026e-04 13.871 < 2e-16 ***
## hypertension 4.001e-02 1.026e-02 3.901 9.71e-05 ***
## heart_disease 5.345e-02 1.347e-02 3.969 7.31e-05 ***
## ever_married -3.849e-02 8.466e-03 -4.547 5.57e-06 ***
## work_type -6.862e-03 2.788e-03 -2.462 0.0139 *
## Residence_type 5.256e-03 5.783e-03 0.909 0.3635
## avg_glucose_level 3.190e-04 6.703e-05 4.759 2.00e-06 ***
## bmi -9.537e-04 4.156e-04 -2.295 0.0218 *
## smoking_status 2.821e-03 2.917e-03 0.967 0.3336
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2066 on 5099 degrees of freedom
## Multiple R-squared: 0.08114, Adjusted R-squared: 0.07934
## F-statistic: 45.03 on 10 and 5099 DF, p-value: < 2.2e-16
Explanation : Variable has high correlation with stroke : 1. Age 2. Hypertension 3. Heart_disease 4. Ever_marriage 5. Avg_glucose_level
#Use Relative Important
library(relaimpo)
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:survival':
##
## aml
## The following object is masked from 'package:lattice':
##
## melanoma
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'survey'
## The following object is masked from 'package:Hmisc':
##
## deff
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
imp <- calc.relimp(lm, type = "lmg", rela = F)
sort(round(imp$lmg, 4), decreasing = FALSE)
## gender Residence_type bmi smoking_status
## 0.0001 0.0002 0.0009 0.0012
## work_type ever_married hypertension avg_glucose_level
## 0.0027 0.0058 0.0077 0.0088
## heart_disease age
## 0.0090 0.0446
Explanation : variable with high correlation with stroke : 1. Age 2. Heart_disease 3. Avg_glucose_level 4. Hypertension 5. Ever_married
In conclusion, the 5 most important feature analyzed by 3 three methods is : Age, Heart_disease, Avg_glucose_level, Hypertension and Ever_married.
##1. C. Modelling # Split data to train and test
set.seed(100)
train <- createDataPartition(DataStroke$stroke, p = 0.8, list=FALSE)
testing = DataStroke[-train,]
training = DataStroke[train,]
Explanation : Use 80% of data to training and 20% for testing
#Convert to factor
DataStroke$gender <- as.factor(DataStroke$gender)
DataStroke$ever_married <- as.factor(DataStroke$ever_married)
DataStroke$smoking_status <- as.factor(DataStroke$smoking_status)
DataStroke$Residence_type <- as.factor(DataStroke$Residence_type)
DataStroke$work_type <- as.factor(DataStroke$work_type)
str(DataStroke)
## 'data.frame': 5110 obs. of 11 variables:
## $ gender : Factor w/ 3 levels "1","2","3": 2 1 2 1 1 2 2 1 1 1 ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 1 2 2 ...
## $ work_type : Factor w/ 5 levels "1","2","3","4",..: 4 5 4 4 5 4 4 4 4 4 ...
## $ Residence_type : Factor w/ 2 levels "1","2": 2 1 1 2 1 2 1 2 1 2 ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : num 36.6 28.9 32.5 34.4 24 ...
## $ smoking_status : Factor w/ 4 levels "1","2","3","4": 1 2 2 3 2 1 2 2 4 4 ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
#Check Dimension
dim(training)
## [1] 4088 11
dim(testing)
## [1] 1022 11
Explanation : For training there is 4088 rows and 11 columns. And for testing there is 1022 rows and 11 columns.
#Modelling
mod <- glm(stroke~., family = binomial(link = "logit"), data = training)
summary(mod)
##
## Call:
## glm(formula = stroke ~ ., family = binomial(link = "logit"),
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1774 -0.3109 -0.1645 -0.0823 3.7025
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.2931018 0.8181787 -8.914 < 2e-16 ***
## gender 0.0795525 0.1580732 0.503 0.61478
## age 0.0745306 0.0061146 12.189 < 2e-16 ***
## hypertension 0.3517248 0.1902730 1.849 0.06453 .
## heart_disease 0.3470843 0.2123888 1.634 0.10222
## ever_married -0.2991198 0.2358839 -1.268 0.20477
## work_type -0.0848952 0.0802336 -1.058 0.29001
## Residence_type 0.0694958 0.1555912 0.447 0.65512
## avg_glucose_level 0.0038651 0.0013476 2.868 0.00413 **
## bmi -0.0001952 0.0127981 -0.015 0.98783
## smoking_status 0.0601813 0.0752610 0.800 0.42392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1573.2 on 4087 degrees of freedom
## Residual deviance: 1244.9 on 4077 degrees of freedom
## AIC: 1266.9
##
## Number of Fisher Scoring iterations: 7
Explanation : Only age and avg_glucose_level are statistically significant.
mod1 <- glm(stroke~age+avg_glucose_level, family = binomial(link = "logit"), data = training)
summary(mod1)
##
## Call:
## glm(formula = stroke ~ age + avg_glucose_level, family = binomial(link = "logit"),
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9502 -0.3205 -0.1681 -0.0765 3.8440
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.791570 0.408368 -19.08 < 2e-16 ***
## age 0.074784 0.005765 12.97 < 2e-16 ***
## avg_glucose_level 0.004338 0.001283 3.38 0.000725 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1573.2 on 4087 degrees of freedom
## Residual deviance: 1255.0 on 4085 degrees of freedom
## AIC: 1261
##
## Number of Fisher Scoring iterations: 7
Explanation : 1. Second model have lower p value (far from 0.05), means second model is better. 2. AIC second model is lower, shows that second model is better.
#Prediction
predictmod <- predict(mod1, newdata = subset(testing, select = c(1:10)), type = "response")
pred <- prediction(predictmod, testing$stroke)
roc <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc)
Explanation : ROC Curve shows good curve, as the curve always increase
and close to 1 (Perfect curve).
#AUC
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8198882
Explanation : Value of AUC is close to 1 (perfect curve). AUC is area of ROC curve, so if it close to 1, then it is close to perfect curve.
#Check Accuracy
res <- ifelse(predictmod > 0.5 , 1, 0)
misclassification_error <- mean(res != testing$stroke)
print(paste("Accuracy : ", 1 - misclassification_error))
## [1] "Accuracy : 0.948140900195695"
Explanation : Accuracy of prediction is quite high, 94,8%.
res1 <- ifelse(predictmod > 0.5, 1, 0)
misclassification_error1 <- mean(res != training$stroke)
print(paste("Accuracy Training : ", 1 - misclassification_error1))
## [1] "Accuracy Training : 0.952054794520548"
Explanation : 1. Accuracy of prediction using training is 95,2% 2. The model is not because the difference of those accuracy is not high (arround 0.4%)
DTree <- rpart(stroke~., training, method = "class", cp = .0001)
DTree
## n= 4088
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4088 196 0 (0.952054795 0.047945205)
## 2) age< 67.5 3398 77 0 (0.977339612 0.022660388)
## 4) age< 53.5 2600 21 0 (0.991923077 0.008076923) *
## 5) age>=53.5 798 56 0 (0.929824561 0.070175439)
## 10) avg_glucose_level< 110.47 531 22 0 (0.958568738 0.041431262) *
## 11) avg_glucose_level>=110.47 267 34 0 (0.872659176 0.127340824)
## 22) heart_disease< 0.5 235 25 0 (0.893617021 0.106382979)
## 44) bmi>=28.89662 162 12 0 (0.925925926 0.074074074) *
## 45) bmi< 28.89662 73 13 0 (0.821917808 0.178082192)
## 90) bmi< 27.2 37 2 0 (0.945945946 0.054054054) *
## 91) bmi>=27.2 36 11 0 (0.694444444 0.305555556)
## 182) avg_glucose_level>=213.85 9 1 0 (0.888888889 0.111111111) *
## 183) avg_glucose_level< 213.85 27 10 0 (0.629629630 0.370370370)
## 366) smoking_status>=1.5 20 6 0 (0.700000000 0.300000000)
## 732) work_type>=3 13 2 0 (0.846153846 0.153846154) *
## 733) work_type< 3 7 3 1 (0.428571429 0.571428571) *
## 367) smoking_status< 1.5 7 3 1 (0.428571429 0.571428571) *
## 23) heart_disease>=0.5 32 9 0 (0.718750000 0.281250000)
## 46) smoking_status< 2.5 22 3 0 (0.863636364 0.136363636) *
## 47) smoking_status>=2.5 10 4 1 (0.400000000 0.600000000) *
## 3) age>=67.5 690 119 0 (0.827536232 0.172463768)
## 6) age< 73.5 237 27 0 (0.886075949 0.113924051)
## 12) ever_married>=1.5 217 21 0 (0.903225806 0.096774194) *
## 13) ever_married< 1.5 20 6 0 (0.700000000 0.300000000)
## 26) hypertension< 0.5 13 2 0 (0.846153846 0.153846154) *
## 27) hypertension>=0.5 7 3 1 (0.428571429 0.571428571) *
## 7) age>=73.5 453 92 0 (0.796909492 0.203090508)
## 14) bmi>=33.75 64 5 0 (0.921875000 0.078125000) *
## 15) bmi< 33.75 389 87 0 (0.776349614 0.223650386)
## 30) bmi< 28.84662 242 45 0 (0.814049587 0.185950413)
## 60) hypertension< 0.5 198 30 0 (0.848484848 0.151515152) *
## 61) hypertension>=0.5 44 15 0 (0.659090909 0.340909091)
## 122) avg_glucose_level< 150.23 31 7 0 (0.774193548 0.225806452) *
## 123) avg_glucose_level>=150.23 13 5 1 (0.384615385 0.615384615) *
## 31) bmi>=28.84662 147 42 0 (0.714285714 0.285714286)
## 62) bmi>=28.89662 115 24 0 (0.791304348 0.208695652)
## 124) smoking_status>=3.5 20 1 0 (0.950000000 0.050000000) *
## 125) smoking_status< 3.5 95 23 0 (0.757894737 0.242105263)
## 250) bmi< 30.95 43 7 0 (0.837209302 0.162790698) *
## 251) bmi>=30.95 52 16 0 (0.692307692 0.307692308)
## 502) avg_glucose_level< 87.105 16 2 0 (0.875000000 0.125000000) *
## 503) avg_glucose_level>=87.105 36 14 0 (0.611111111 0.388888889)
## 1006) avg_glucose_level>=108.405 27 8 0 (0.703703704 0.296296296) *
## 1007) avg_glucose_level< 108.405 9 3 1 (0.333333333 0.666666667) *
## 63) bmi< 28.89662 32 14 1 (0.437500000 0.562500000)
## 126) heart_disease>=0.5 11 4 0 (0.636363636 0.363636364) *
## 127) heart_disease< 0.5 21 7 1 (0.333333333 0.666666667)
## 254) hypertension>=0.5 7 2 0 (0.714285714 0.285714286) *
## 255) hypertension< 0.5 14 2 1 (0.142857143 0.857142857) *
Explanation : Cp (complexity parameter) : The lower cp the more nodes & branches can be seen.
rpart.plot(DTree, type = 1, fallen.leaves = FALSE, tweak = 1.2)
Explanation : Making prediction based on data.
#Prediction Decision Tree
predicttree <- predict(DTree, testing, type = "class")
Explanation : Prediction value of decision tree.
#Confusion Matrix
table <- table(predicttree, testing$stroke)
table
##
## predicttree 0 1
## 0 954 47
## 1 15 6
Explanation : 1. Column represent prediction value 2. Row represent actual value 3. TP = 954, FN = 15, FP = 47 TN = 6
#Overall Accuracy
overall <- sum(diag(table))/sum(table)
overall
## [1] 0.9393346
Explanation : The model have overall accuracy 93,9%.
#Misclassification Rate / Error Rate
error <- 1 - overall
error
## [1] 0.06066536
Explanation : The model have error rate 6 %.