Video Link : https://binusianorg-my.sharepoint.com/personal/lionel_riyadi_binus_ac_id/_layouts/15/guestaccess.aspx?guestaccesstoken=bUvHVzwEQD4wDsoeOBtwmvu7pz9p9WhkM73hbpugCLw%3D&docid=2_05a8d21e706af4cecbaee51cb06634d40&rev=1&e=eI5yXz

#Import Library needed

library(ROCR)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(visdat) 
library(reshape2) 
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(ggplot2)
library(DataExplorer)
library(caTools)

First, we import library that we needs

df <- read.csv("StrokeData.csv", stringsAsFactors= TRUE)
head(df)
##      id gender age hypertension heart_disease ever_married     work_type
## 1  9046   Male  67            0             1          Yes       Private
## 2 51676 Female  61            0             0          Yes Self-employed
## 3 31112   Male  80            0             1          Yes       Private
## 4 60182 Female  49            0             0          Yes       Private
## 5  1665 Female  79            1             0          Yes Self-employed
## 6 56669   Male  81            0             0          Yes       Private
##   Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1          Urban            228.69 36.6 formerly smoked      1
## 2          Rural            202.21  N/A    never smoked      1
## 3          Rural            105.92 32.5    never smoked      1
## 4          Urban            171.23 34.4          smokes      1
## 5          Rural            174.12   24    never smoked      1
## 6          Urban            186.21   29 formerly smoked      1
str(df)
## 'data.frame':    5110 obs. of  12 variables:
##  $ id               : int  9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
##  $ gender           : Factor w/ 3 levels "Female","Male",..: 2 1 2 1 1 2 2 1 1 1 ...
##  $ age              : num  67 61 80 49 79 81 74 69 59 78 ...
##  $ hypertension     : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ heart_disease    : int  1 0 1 0 0 0 1 0 0 0 ...
##  $ ever_married     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
##  $ work_type        : Factor w/ 5 levels "children","Govt_job",..: 4 5 4 4 5 4 4 4 4 4 ...
##  $ Residence_type   : Factor w/ 2 levels "Rural","Urban": 2 1 1 2 1 2 1 2 1 2 ...
##  $ avg_glucose_level: num  229 202 106 171 174 ...
##  $ bmi              : Factor w/ 419 levels "10.3","11.3",..: 240 419 199 218 114 164 148 102 419 116 ...
##  $ smoking_status   : Factor w/ 4 levels "formerly smoked",..: 1 2 2 3 2 1 2 2 4 4 ...
##  $ stroke           : int  1 1 1 1 1 1 1 1 1 1 ...
levels(df$gender)
## [1] "Female" "Male"   "Other"

#Attribut Id = unique data gender = Male, Female, Other age = age in years hypertension = (1 = have hypertension , 0 = otherwise) heart_disease = (1 = have heart_disease, 0 = otherwise) ever_married = (1 = yes, 0 = no) work_type = children, Govt_job, Never_worked, Private, Self_employed Residence_type = Rural, Urban avg_glucose_level= average glucose patient bmi = body mass index of the patient smoking_status = formerly smoked, never smoked, smokes, Unknown stroke = (1 = have stroke, 0 = doesn’t have stroke) #Explorasi Data Analysis

dim(df)
## [1] 5110   12

#Checking dimension

summary(df)
##        id           gender          age         hypertension    
##  Min.   :   67   Female:2994   Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Male  :2115   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Other :   1   Median :45.00   Median :0.00000  
##  Mean   :36518                 Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                 3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                 Max.   :82.00   Max.   :1.00000  
##                                                                 
##  heart_disease     ever_married         work_type    Residence_type
##  Min.   :0.00000   No :1757     children     : 687   Rural:2514    
##  1st Qu.:0.00000   Yes:3353     Govt_job     : 657   Urban:2596    
##  Median :0.00000                Never_worked :  22                 
##  Mean   :0.05401                Private      :2925                 
##  3rd Qu.:0.00000                Self-employed: 819                 
##  Max.   :1.00000                                                   
##                                                                    
##  avg_glucose_level      bmi               smoking_status     stroke       
##  Min.   : 55.12    N/A    : 201   formerly smoked: 885   Min.   :0.00000  
##  1st Qu.: 77.25    28.7   :  41   never smoked   :1892   1st Qu.:0.00000  
##  Median : 91.89    28.4   :  38   smokes         : 789   Median :0.00000  
##  Mean   :106.15    26.1   :  37   Unknown        :1544   Mean   :0.04873  
##  3rd Qu.:114.09    26.7   :  37                          3rd Qu.:0.00000  
##  Max.   :271.74    27.6   :  37                          Max.   :1.00000  
##                    (Other):4719

In this summary dataset, we can see that bmi has 201 missing value. instead of removing it, we can use the bmi avg to fill the missing value.

#Changing N/A value in bmi column to the bmi average

df$bmi = as.character(df$bmi)
df$bmi[df$bmi == "N/A"]<- "0"
df$bmi= as.integer(df$bmi)
df$bmi[df$bmi == 0]<-mean(df$bmi)
df$bmi[is.na(df$bmi)]=0
head(df)
##      id gender age hypertension heart_disease ever_married     work_type
## 1  9046   Male  67            0             1          Yes       Private
## 2 51676 Female  61            0             0          Yes Self-employed
## 3 31112   Male  80            0             1          Yes       Private
## 4 60182 Female  49            0             0          Yes       Private
## 5  1665 Female  79            1             0          Yes Self-employed
## 6 56669   Male  81            0             0          Yes       Private
##   Residence_type avg_glucose_level      bmi  smoking_status stroke
## 1          Urban            228.69 36.00000 formerly smoked      1
## 2          Rural            202.21 27.32955    never smoked      1
## 3          Rural            105.92 32.00000    never smoked      1
## 4          Urban            171.23 34.00000          smokes      1
## 5          Rural            174.12 24.00000    never smoked      1
## 6          Urban            186.21 29.00000 formerly smoked      1
hist(df$avg_glucose_level)

hist(df$bmi)

Before we get continue, let’s find the outlier for avg glucose and bmi #Checking outlier using Three Sigma, Hampel, and Boxplot Rule

#three sigma
x<- df$avg_glucose_level
t <- 3
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
up <- mu + t * sig
down<- mu - t * sig
#hampel
a<-df$avg_glucose_level
mus <- median(a, na.rm = TRUE)
sigs <- mad(a, na.rm = TRUE)
ups <- mu + t * sigs
downs <- mu - t * sigs
#boxplot rule
t <- 1.5
x<- df$avg_glucose_level
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q <- xU - xL
upv <- xU + t * Q
downv <- xU - t * Q


par(mfrow=c(1,1))
plot(df$avg_glucose_level,main ="Three Sigma edit rule")
abline(h = up,lty=2,lwd=1, col = "red")
abline(h = down, lty = 2, lwd =1, col = "red")

par(mfrow=c(1,1))
plot(df$avg_glucose_level,main = "Hampel identifier")
abline(h = ups, lty = 2, lwd =1, col = "red")
abline(h = downs, lty = 2, lwd =1,col="red")

par(mfrow=c(1,1))
plot(df$avg_glucose_level,main = "Boxplot Rule")
abline(h = upv,lty=2,lwd=1, col = "red")
abline(h = downv, lty = 2, lwd =1, col = "red")

cat("Three sigma rule\n")
## Three sigma rule
cat("UP : ")
## UP :
df$avg_glucose_level[which(df$avg_glucose_level > up)]
##  [1] 252.72 243.58 259.63 249.31 263.32 271.74 242.52 250.89 247.51 243.53
## [11] 242.30 243.50 251.60 247.69 250.20 254.60 254.63 246.34 251.46 267.76
## [21] 246.53 244.28 251.99 253.16 242.84 249.29 242.94 247.48 266.59 243.73
## [31] 243.59 250.80 255.17 267.61 260.85 248.37 263.56 247.97 248.24 253.93
## [41] 254.95 247.87 261.67 256.74 244.30 242.62 243.52 267.60 253.86
cat("Down : ")
## Down :
df$avg_glucose_level[which(df$avg_glucose_level < down)]
## numeric(0)
cat("\nHampeL identifier:\n")
## 
## HampeL identifier:
cat("Up : ")
## Up :
df$avg_glucose_level[which(df$avg_glucose_level > ups)]
##   [1] 228.69 202.21 186.21 219.84 214.09 191.61 221.29 217.08 193.94 233.29
##  [11] 228.70 208.30 189.84 195.23 211.78 212.08 196.92 252.72 219.72 213.03
##  [21] 243.58 197.54 196.71 237.75 194.99 185.17 221.58 228.56 240.09 226.98
##  [31] 235.63 240.59 190.32 231.61 191.82 224.10 216.94 259.63 249.31 219.91
##  [41] 200.59 190.14 206.09 263.32 207.28 194.37 199.20 221.79 239.07 223.83
##  [51] 231.56 221.89 195.71 203.87 185.49 213.22 215.94 209.86 205.77 271.74
##  [61] 200.62 242.52 208.65 205.33 210.40 199.86 219.73 250.89 205.35 216.58
##  [71] 184.40 199.84 218.46 211.06 197.28 233.94 247.51 210.95 243.53 205.84
##  [81] 198.21 206.72 214.45 190.70 203.04 242.30 220.49 218.46 216.70 234.58
##  [91] 235.85 243.50 229.92 215.60 239.64 200.28 205.23 209.58 210.78 251.60
## [101] 213.37 223.36 203.81 205.26 211.03 225.47 227.10 201.76 217.30 196.01
## [111] 198.69 186.17 210.48 193.83 247.69 191.47 239.82 189.57 207.58 215.64
## [121] 196.36 188.11 205.50 204.86 228.08 219.53 219.97 214.05 200.49 240.71
## [131] 197.10 194.62 222.21 250.20 254.60 212.01 186.45 189.49 186.32 226.70
## [141] 194.04 237.15 231.19 207.32 207.64 236.84 204.63 232.89 195.03 227.91
## [151] 204.50 206.25 254.63 246.34 195.16 223.68 229.20 193.22 204.57 251.46
## [161] 220.52 195.04 218.65 211.49 224.71 226.11 210.94 230.68 198.02 204.17
## [171] 267.76 217.71 239.52 229.86 210.96 195.25 217.39 201.25 197.79 214.77
## [181] 189.45 206.40 197.58 199.96 205.77 237.21 246.53 206.33 206.98 227.28
## [191] 228.70 244.28 251.99 191.79 216.88 222.29 213.11 227.51 201.01 210.00
## [201] 237.58 207.45 226.93 253.16 238.53 207.79 196.20 231.76 216.92 194.98
## [211] 218.54 237.17 197.09 242.84 202.66 216.90 210.00 208.05 222.60 199.14
## [221] 191.48 200.16 190.40 215.90 233.52 213.54 188.69 219.50 217.66 227.16
## [231] 209.90 211.88 225.60 210.23 234.82 230.59 224.63 185.71 208.17 185.31
## [241] 203.04 187.87 213.87 222.85 198.36 196.25 194.53 204.05 199.18 209.26
## [251] 217.11 222.46 187.52 237.74 223.35 201.07 208.06 186.95 198.24 229.21
## [261] 209.06 228.42 212.97 202.05 206.25 231.69 219.96 197.69 199.88 208.78
## [271] 222.29 220.36 187.88 191.66 217.75 226.88 186.40 203.81 189.44 249.29
## [281] 211.35 206.59 196.33 242.94 226.75 185.00 199.83 227.81 240.81 239.28
## [291] 231.50 192.37 220.47 196.91 247.48 216.00 219.39 220.47 198.33 191.33
## [301] 206.52 216.96 232.81 207.95 229.58 187.22 227.04 214.42 233.71 216.40
## [311] 266.59 227.94 205.00 203.44 243.73 200.28 221.43 213.38 192.16 215.72
## [321] 202.57 209.50 203.16 201.45 206.15 196.61 219.92 231.95 216.38 213.33
## [331] 243.59 227.98 208.20 199.42 190.13 235.54 227.74 213.80 250.80 217.84
## [341] 217.00 217.40 190.92 255.17 217.55 227.96 231.71 196.81 222.66 223.58
## [351] 198.79 192.39 233.30 201.38 236.14 193.81 239.95 202.21 198.79 202.55
## [361] 232.12 203.57 230.78 204.98 227.89 216.71 202.67 221.80 202.38 215.81
## [371] 220.24 195.61 267.61 207.62 201.58 231.43 220.26 211.12 215.33 212.02
## [381] 228.20 260.85 223.90 207.96 205.01 191.78 214.43 220.64 204.77 248.37
## [391] 194.53 228.92 227.68 226.73 219.17 215.92 198.12 240.86 263.56 200.14
## [401] 235.45 207.71 228.05 223.14 214.51 231.31 238.78 233.59 188.13 205.97
## [411] 190.89 193.87 214.77 189.88 197.11 192.47 199.38 202.98 198.32 226.38
## [421] 236.79 219.82 239.19 206.62 216.88 204.92 226.84 234.35 200.73 202.51
## [431] 218.00 209.15 202.66 196.50 209.50 219.81 205.23 234.27 239.21 196.08
## [441] 193.45 219.38 217.94 216.64 208.85 219.70 208.05 185.28 198.30 206.66
## [451] 200.68 218.60 223.26 221.83 218.10 200.46 217.79 233.47 200.98 219.67
## [461] 207.60 247.97 231.15 186.54 221.06 212.62 217.74 208.99 197.36 222.52
## [471] 232.64 207.37 201.96 213.43 248.24 229.94 202.06 253.93 194.75 207.84
## [481] 228.26 203.76 205.78 230.74 216.19 200.66 228.50 232.29 200.91 236.04
## [491] 254.95 196.58 189.82 193.61 195.74 221.24 192.50 212.92 191.94 247.87
## [501] 229.73 261.67 256.74 221.08 208.39 227.23 203.27 234.50 190.67 197.06
## [511] 216.07 203.87 235.06 195.43 200.25 223.64 199.78 244.30 223.16 226.28
## [521] 213.92 212.19 200.80 222.58 206.53 232.78 187.47 234.06 242.62 231.54
## [531] 219.80 187.99 234.45 240.69 217.57 234.51 214.73 208.69 231.72 206.53
## [541] 193.80 203.01 198.84 243.52 238.27 208.31 211.83 215.69 267.60 215.07
## [551] 225.35 196.26 212.87 185.27 206.49 253.86 203.36 191.15 223.78 211.58
## [561] 193.88
cat("Down: ")
## Down:
df$avg_glucose_level[which(df$avg_glucose_level < downs)]
## numeric(0)
cat("\nBoxplot rule\n")
## 
## Boxplot rule
cat("up :")
## up :
df$avg_glucose_level[which(df$avg_glucose_level > upv)]
##   [1] 228.69 202.21 171.23 174.12 186.21 219.84 214.09 191.61 221.29 217.08
##  [11] 193.94 233.29 228.70 208.30 189.84 195.23 211.78 212.08 196.92 252.72
##  [21] 219.72 213.03 243.58 197.54 196.71 237.75 194.99 180.93 185.17 221.58
##  [31] 179.12 228.56 240.09 226.98 235.63 240.59 190.32 231.61 191.82 224.10
##  [41] 216.94 259.63 249.31 219.91 200.59 190.14 182.99 206.09 263.32 207.28
##  [51] 194.37 199.20 221.79 239.07 169.67 223.83 231.56 221.89 195.71 203.87
##  [61] 185.49 213.22 215.94 209.86 205.77 271.74 200.62 242.52 175.29 208.65
##  [71] 205.33 210.40 199.86 219.73 250.89 205.35 216.58 184.40 199.84 218.46
##  [81] 211.06 197.28 233.94 247.51 210.95 243.53 205.84 198.21 206.72 214.45
##  [91] 190.70 203.04 242.30 220.49 218.46 216.70 234.58 235.85 243.50 182.20
## [101] 229.92 215.60 239.64 200.28 205.23 209.58 210.78 251.60 213.37 223.36
## [111] 178.29 203.81 205.26 211.03 225.47 180.63 227.10 201.76 170.05 217.30
## [121] 196.01 184.15 198.69 186.17 183.45 210.48 193.83 183.34 247.69 191.47
## [131] 239.82 189.57 207.58 182.86 215.64 196.36 188.11 205.50 204.86 228.08
## [141] 219.53 219.97 214.05 200.49 240.71 197.10 194.62 222.21 250.20 173.43
## [151] 184.25 254.60 212.01 186.45 189.49 186.32 226.70 183.10 194.04 237.15
## [161] 231.19 207.32 207.64 236.84 204.63 232.89 195.03 170.95 227.91 204.50
## [171] 206.25 254.63 246.34 195.16 223.68 229.20 193.22 204.57 251.46 220.52
## [181] 195.04 218.65 211.49 224.71 226.11 210.94 230.68 198.02 204.17 267.76
## [191] 217.71 180.76 239.52 229.86 210.96 195.25 217.39 201.25 197.79 214.77
## [201] 181.23 189.45 206.40 178.76 197.58 199.96 205.77 237.21 246.53 206.33
## [211] 206.98 227.28 228.70 169.97 244.28 251.99 191.79 216.88 222.29 213.11
## [221] 227.51 201.01 210.00 237.58 207.45 226.93 253.16 238.53 207.79 196.20
## [231] 231.76 216.92 194.98 218.54 183.00 237.17 178.33 197.09 242.84 202.66
## [241] 216.90 210.00 208.05 222.60 199.14 191.48 200.16 190.40 215.90 233.52
## [251] 213.54 188.69 219.50 217.66 227.16 209.90 176.48 211.88 225.60 210.23
## [261] 234.82 230.59 224.63 185.71 208.17 185.31 203.04 187.87 213.87 222.85
## [271] 198.36 196.25 194.53 204.05 199.18 209.26 217.11 222.46 187.52 237.74
## [281] 223.35 201.07 208.06 186.95 198.24 229.21 209.06 228.42 212.97 202.05
## [291] 206.25 231.69 219.96 197.69 199.88 170.22 208.78 222.29 220.36 187.88
## [301] 191.66 217.75 226.88 186.40 169.49 203.81 170.76 189.44 249.29 211.35
## [311] 206.59 196.33 242.94 226.75 185.00 199.83 227.81 240.81 239.28 231.50
## [321] 192.37 220.47 196.91 180.80 247.48 216.00 219.39 220.47 173.96 198.33
## [331] 191.33 206.52 216.96 170.93 232.81 207.95 229.58 187.22 227.04 214.42
## [341] 233.71 216.40 266.59 227.94 205.00 203.44 243.73 176.25 200.28 221.43
## [351] 213.38 192.16 215.72 173.14 202.57 209.50 203.16 201.45 206.15 196.61
## [361] 219.92 231.95 216.38 213.33 172.33 243.59 169.43 183.87 227.98 208.20
## [371] 199.42 190.13 235.54 178.89 227.74 213.80 250.80 217.84 217.00 217.40
## [381] 190.92 182.90 255.17 217.55 227.96 231.71 196.81 222.66 223.58 198.79
## [391] 192.39 233.30 201.38 236.14 193.81 239.95 170.88 202.21 181.30 198.79
## [401] 202.55 232.12 203.57 230.78 204.98 227.89 216.71 202.67 221.80 202.38
## [411] 215.81 220.24 195.61 267.61 176.71 207.62 201.58 231.43 220.26 211.12
## [421] 177.91 215.33 212.02 228.20 260.85 223.90 169.74 207.96 176.78 205.01
## [431] 191.78 214.43 220.64 204.77 248.37 194.53 228.92 227.68 226.73 219.17
## [441] 215.92 198.12 240.86 263.56 200.14 235.45 207.71 228.05 223.14 174.43
## [451] 214.51 231.31 238.78 233.59 188.13 205.97 190.89 193.87 214.77 189.88
## [461] 197.11 192.47 199.38 202.98 198.32 226.38 236.79 219.82 239.19 206.62
## [471] 216.88 204.92 226.84 234.35 200.73 202.51 218.00 209.15 202.66 196.50
## [481] 209.50 219.81 205.23 234.27 239.21 196.08 176.38 175.74 193.45 180.45
## [491] 219.38 173.90 217.94 216.64 173.97 208.85 219.70 208.05 185.28 198.30
## [501] 206.66 200.68 218.60 223.26 172.27 221.83 218.10 200.46 217.79 233.47
## [511] 181.23 200.98 219.67 207.60 247.97 231.15 186.54 221.06 212.62 217.74
## [521] 208.99 197.36 222.52 232.64 207.37 201.96 213.43 248.24 229.94 202.06
## [531] 253.93 194.75 207.84 228.26 203.76 205.78 179.67 230.74 216.19 200.66
## [541] 228.50 232.29 200.91 236.04 254.95 196.58 189.82 193.61 195.74 221.24
## [551] 192.50 212.92 191.94 247.87 229.73 261.67 256.74 221.08 208.39 227.23
## [561] 203.27 234.50 190.67 197.06 216.07 179.14 203.87 235.06 195.43 200.25
## [571] 223.64 199.78 176.42 244.30 223.16 226.28 172.86 213.92 212.19 200.80
## [581] 222.58 206.53 232.78 187.47 234.06 242.62 174.54 231.54 219.80 187.99
## [591] 234.45 240.69 217.57 234.51 182.22 214.73 208.69 231.72 206.53 193.80
## [601] 203.01 177.56 198.84 243.52 238.27 208.31 176.34 211.83 215.69 267.60
## [611] 215.07 225.35 196.26 182.52 212.87 183.43 185.27 206.49 253.86 203.36
## [621] 175.92 191.15 223.78 211.58 179.38 193.88 174.37
cat("down : ")
## down :
df$avg_glucose_level[which(df$avg_glucose_level < downv)]
##   [1] 58.57 58.09 57.92 56.11 57.93 57.08 57.33 55.78 55.42 56.96 58.63 55.39
##  [13] 58.26 58.37 58.65 56.11 57.27 55.32 57.57 56.23 56.75 55.25 56.18 55.34
##  [25] 55.83 55.26 55.27 58.47 56.77 56.99 57.51 56.31 57.82 57.94 58.39 57.93
##  [37] 56.42 58.38 56.48 56.07 55.86 56.30 57.26 56.47 58.03 55.67 57.02 55.22
##  [49] 57.89 57.37 56.07 58.48 58.30 58.71 55.51 58.29 56.63 57.33 57.95 58.25
##  [61] 58.02 56.84 58.69 56.67 58.66 56.30 56.43 57.92 56.51 55.41 57.02 55.57
##  [73] 58.14 55.34 55.61 58.70 56.90 57.30 58.51 56.34 55.62 56.71 55.58 56.12
##  [85] 55.46 58.63 57.43 58.41 56.89 57.47 57.76 58.65 57.51 56.12 57.02 58.35
##  [97] 57.28 58.42 56.37 56.11 55.64 56.08 56.33 56.95 57.77 57.17 57.80 56.42
## [109] 55.59 55.96 56.75 57.57 58.19 55.72 56.74 55.28 56.32 55.47 56.21 56.08
## [121] 57.42 58.01 56.85 58.24 56.79 57.60 56.13 58.55 57.59 55.23 55.35 57.46
## [133] 57.96 56.33 56.90 57.79 58.01 57.38 56.94 57.09 56.48 56.64 55.79 55.93
## [145] 58.66 57.40 57.56 57.56 58.42 57.52 58.23 58.64 56.11 58.19 57.10 57.28
## [157] 58.81 58.35 57.83 57.15 55.12 56.87 58.39 57.42 57.06 56.85 55.84 58.72

In Average Glucose Level detect alot of outlier which almost up to 25%

#three sigma
x<- df$bmi
t <- 3
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
up <- mu + t * sig
down<- mu - t * sig
#hampel
a<-df$bmi
mus <- median(a, na.rm = TRUE)
sigs <- mad(a, na.rm = TRUE)
ups <- mu + t * sigs
downs <- mu - t * sigs
#boxplot rule
t <- 1.5
x<- df$bmi
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q <- xU - xL
upv <- xU + t * Q
downv <- xU - t * Q


par(mfrow=c(1,1))
plot(df$bmi,main ="Three Sigma edit rule")
abline(h = up,lty=2,lwd=1, col = "red")
abline(h = down, lty = 2, lwd =1, col = "red")

par(mfrow=c(1,1))
plot(df$bmi,main = "Hampel identifier")
abline(h = ups, lty = 2, lwd =1, col = "red")
abline(h = downs, lty = 2, lwd =1,col="red")

par(mfrow=c(1,1))
plot(df$bmi,main = "Boxplot Rule")
abline(h = upv,lty=2,lwd=1, col = "red")
abline(h = downv, lty = 2, lwd =1, col = "red")

cat("Three sigma rule\n")
## Three sigma rule
cat("UP : ")
## UP :
df$bmi[which(df$bmi > up)]
##  [1] 56 54 60 54 64 54 60 71 54 55 55 57 54 52 78 53 55 55 54 52 66 55 55 57 56
## [26] 57 54 56 97 53 53 52 52 55 53 63 52 61 58 52 53 59 52 52 54 61 53 54 55 57
## [51] 64 92 55 57 55 57 60 54 56
cat("Down : ")
## Down :
df$bmi[which(df$bmi < down)]
## numeric(0)
cat("\nHampeL identifier:\n")
## 
## HampeL identifier:
cat("Up : ")
## Up :
df$bmi[which(df$bmi > ups)]
##   [1] 48 56 50 54 60 54 48 64 54 49 60 51 51 71 50 54 55 55 57 54 52 50 78 50 53
##  [26] 55 48 50 49 55 54 50 52 66 55 48 55 57 49 56 51 57 48 49 49 54 56 97 53 49
##  [51] 48 49 48 48 53 48 52 52 55 53 50 51 63 52 61 48 50 48 58 49 50 52 48 49 51
##  [76] 53 50 59 52 52 54 61 49 53 54 55 50 50 57 64 92 50 55 57 55 48 57 50 48 51
## [101] 60 54 56 49
cat("Down: ")
## Down:
df$bmi[which(df$bmi < downs)]
## numeric(0)
cat("\nBoxplot rule\n")
## 
## Boxplot rule
cat("up :")
## up :
df$bmi[which(df$bmi > upv)]
##   [1] 48 47 56 46 50 54 60 54 48 64 47 46 46 54 49 60 51 51 71 50 47 54 55 55 57
##  [26] 54 52 46 50 78 50 53 55 48 50 46 49 55 54 50 47 52 66 55 48 55 57 49 56 51
##  [51] 57 48 49 49 54 56 97 53 49 48 49 48 48 53 46 48 52 52 55 53 50 46 51 63 52
##  [76] 61 48 46 50 48 58 49 50 52 48 49 51 53 50 59 47 52 52 54 61 49 53 47 54 47
## [101] 55 46 50 50 57 46 64 92 50 55 46 57 47 55 48 57 47 46 46 50 47 48 51 60 47
## [126] 47 46 54 56 46 49 47 46
cat("down : ")
## down :
df$bmi[which(df$bmi< downv)]
##   [1] 16 18 17 17 16 18 16 18 16 15 16 16 17 17 18 16 13 18 15 17 17 17 16 16 16
##  [26] 18 17 18 16 17 17 12 18 14 16 18 16 17 17 17 14 17 18 15 12 16 18 15 18 15
##  [51] 17 18 18 18 18 16 18 17 16 18 16 18 17 18 17 18 18 17 14 16 14 15 18 17 18
##  [76] 18 17 18 17 16 17 18 17 18 16 17 15 17 18 18 16 16 17 16 17 15 15 18 18 18
## [101] 17 14 10 14 16 15 18 18 18 16 16 13 17 17 16 18 18 16 16 18 17 16 15 18 17
## [126] 18 17 16 18 14 18 17 16 18 18 17 16 17 18 18 16 17 17 17 17 15 16 16 17 13
## [151] 18 11 17 18 14 18 15 16 18 15 18 18 18 17 18 15 18 17 16 18 17 18 18 15 16
## [176] 17 18 18 18 17 17 17 18 18 18 17 17 18 17 15 15 16 14 15 14 16 18 17 17 17
## [201] 18 18 17 14 18 16 17 17 15 16 17 17 17 14 18 18 18 15 18 18 17 18 18 17 17
## [226] 16 18 15 18 15 15 16 17 16 17 17 11 18 17 12 13 18 14 16 15 16 16 18 14 18
## [251] 15 14 16 15 14 18 16 18 17 17 18 15 15 17 13 14 15 16 18 15 18 18 17 18 15
## [276] 18 18 16 18 16 15 16 16 16 15 18 17 17 18 18 18 18 18 18 16 17 17 18 18 13
## [301] 16 13 16 17 17 15 16 18 13 16 17 17 17 17 17 16 18 17 14 17 18 16 14 18 15
## [326] 14 17 17 16 16 17 17 16 16 16 18 14 18 18 17 18 18 16 18 18 17 14 16 18 18
## [351] 17 18 16 16 14 16 15 18 13 17 17 16 15 14 17 18 16 16 17 14 17 18 18 18 17
## [376] 16 18 18 18 18 18 18 16 18 13 16 14 16 15 16 15 17 18 17 15 17 18 16 15 17
## [401] 16 17 17 18

As we can see, it only detect few outlier in BMI

#Checking correlation between data using hetcor()

library(polycor)
hetcor(df)
## Warning in FUN(X[[i]], ...): polychoric correlation between variables work_type and smoking_status produced warnings:
##    NaNs produced
##    NaNs produced
##    NaNs produced
##    NaNs produced
##    NaNs produced
##    NaNs produced
##    NaNs produced
## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##                          id     gender        age hypertension heart_disease
## id                        1 Polyserial    Pearson      Pearson       Pearson
## gender             0.003529          1 Polyserial   Polyserial    Polyserial
## age                0.003538   -0.03568          1      Pearson       Pearson
## hypertension        0.00355    0.02615     0.2764            1       Pearson
## heart_disease     -0.001296     0.1052     0.2638       0.1083             1
## ever_married        0.01768     -0.051      0.771       0.2611        0.1841
## work_type         -0.002344   -0.09329     0.5895       0.1591        0.1211
## Residence_type    -0.001759   -0.01114    0.01778    -0.009918      0.003877
## avg_glucose_level  0.001092    0.06914     0.2382       0.1745        0.1619
## bmi                0.007521   -0.03509     0.3234       0.1583       0.03639
## smoking_status     -0.01948     0.0376    -0.4118      -0.1327      -0.07388
## stroke             0.006388    0.01111     0.2453       0.1279        0.1349
##                   ever_married  work_type Residence_type avg_glucose_level
## id                  Polyserial Polyserial     Polyserial           Pearson
## gender              Polychoric Polychoric     Polychoric        Polyserial
## age                 Polyserial Polyserial     Polyserial           Pearson
## hypertension        Polyserial Polyserial     Polyserial           Pearson
## heart_disease       Polyserial Polyserial     Polyserial           Pearson
## ever_married                 1 Polychoric     Polychoric        Polyserial
## work_type               0.5715          1     Polychoric        Polyserial
## Residence_type         0.01013    0.00173              1        Polyserial
## avg_glucose_level       0.2228     0.1076      -0.006199                 1
## bmi                       0.44     0.3297      -0.001166            0.1659
## smoking_status         -0.4163    -0.3824         0.0032           -0.1055
## stroke                  0.1743     0.0994        0.01941            0.1319
##                          bmi smoking_status     stroke
## id                   Pearson     Polyserial    Pearson
## gender            Polyserial     Polychoric Polyserial
## age                  Pearson     Polyserial    Pearson
## hypertension         Pearson     Polyserial    Pearson
## heart_disease        Pearson     Polyserial    Pearson
## ever_married      Polyserial     Polychoric Polyserial
## work_type         Polyserial     Polychoric Polyserial
## Residence_type    Polyserial     Polychoric Polyserial
## avg_glucose_level    Pearson     Polyserial    Pearson
## bmi                        1     Polyserial    Pearson
## smoking_status        -0.245              1 Polyserial
## stroke               0.03504       -0.07464          1
## 
## Standard Errors:
##                        id  gender      age hypertension heart_disease
## id                                                                   
## gender            0.01766                                            
## age               0.01399  0.0176                                    
## hypertension      0.01399 0.01756  0.01292                           
## heart_disease     0.01399 0.01732  0.01302      0.01383              
## ever_married      0.01806 0.02273 0.007272      0.02068       0.02155
## work_type         0.01538 0.01925 0.009173      0.01498       0.01526
## Residence_type    0.01754 0.02213  0.01753      0.01753       0.01754
## avg_glucose_level 0.01399 0.01745   0.0132      0.01356       0.01362
## bmi               0.01399 0.01774  0.01253      0.01364       0.01397
## smoking_status    0.01517 0.01909  0.01209      0.01462       0.01495
## stroke            0.01399  0.0176  0.01315      0.01376       0.01374
##                   ever_married work_type Residence_type avg_glucose_level
## id                                                                       
## gender                                                                   
## age                                                                      
## hypertension                                                             
## heart_disease                                                            
## ever_married                                                             
## work_type               0.0143                                           
## Residence_type         0.02263   0.01928                                 
## avg_glucose_level      0.01883   0.01518        0.01753                  
## bmi                    0.01512   0.01326        0.01754           0.01361
## smoking_status         0.01669   0.01413        0.01903           0.01486
## stroke                 0.02167   0.01532        0.01756           0.01375
##                       bmi smoking_status
## id                                      
## gender                                  
## age                                     
## hypertension                            
## heart_disease                           
## ever_married                            
## work_type                               
## Residence_type                          
## avg_glucose_level                       
## bmi                                     
## smoking_status    0.01398               
## stroke            0.01397        0.01501
## 
## n = 5110 
## 
## P-values for Tests of Bivariate Normality:
##                          id    gender        age hypertension heart_disease
## id                                                                         
## gender            6.497e-51                                                
## age               1.668e-65 2.519e-20                                      
## hypertension              0         0          0                           
## heart_disease             0         0          0            0              
## ever_married       9.89e-53    0.1853  3.86e-144            0             0
## work_type                 0  0.006454          0            0             0
## Residence_type    1.585e-53    0.2443  3.445e-17            0             0
## avg_glucose_level         0 1.02e-223 1.482e-277            0             0
## bmi               4.839e-76 3.674e-36  2.82e-176            0             0
## smoking_status    1.733e-48 4.088e-10  2.974e-71            0             0
## stroke                    0         0          0            0             0
##                   ever_married  work_type Residence_type avg_glucose_level
## id                                                                        
## gender                                                                    
## age                                                                       
## hypertension                                                              
## heart_disease                                                             
## ever_married                                                              
## work_type           1.004e-191                                            
## Residence_type            <NA>     0.1941                                 
## avg_glucose_level   6.941e-243          0      8.06e-227                  
## bmi                  2.038e-73          0      1.205e-29        3.352e-259
## smoking_status       1.297e-24 2.373e-163        0.06799        3.551e-230
## stroke                       0          0              0                 0
##                         bmi smoking_status
## id                                        
## gender                                    
## age                                       
## hypertension                              
## heart_disease                             
## ever_married                              
## work_type                                 
## Residence_type                            
## avg_glucose_level                         
## bmi                                       
## smoking_status    5.334e-69               
## stroke                    0              0

From that, we know that stroke has lots of affect from one of these such as age, hypertension, heartdisease, ever_married, avg glucose #checking each data proportion by pie chart

pie_chart<- function(data){
  dataset<- table(df$stroke)
  labels <- names(dataset)
  pct <- prop.table(dataset)*100
  labels2 <- paste(round(pct),"%",sep = "")
  labels3<- paste (labels, labels2)
  pie(dataset,labels = labels3, init.angle = 90, main = "Stroke's Proportion")
}
pie_chart2<- function(data){
  dataset<- table(df$gender)
  labels <- names(dataset)
  pct <- prop.table(dataset)*100
  labels2 <- paste(round(pct),"%",sep = "")
  labels3<- paste (labels, labels2)
  pie(dataset,labels = labels3, init.angle = 90, main = "Gender's Proportion")
}
pie_chart3<- function(data){
  dataset<- table(df$work_type)
  labels <- names(dataset)
  pct <- prop.table(dataset)*100
  labels2 <- paste(round(pct),"%",sep = "")
  labels3<- paste (labels, labels2)
  pie(dataset,labels = labels3, init.angle = 90, main = "Work Type's Proportion")
}
pie_chart4<- function(data){
  dataset<- table(df$ever_married)
  labels <- names(dataset)
  pct <- prop.table(dataset)*100
  labels2 <- paste(round(pct),"%",sep = "")
  labels3<- paste (labels, labels2)
  pie(dataset,labels = labels3, init.angle = 90, main = "Ever Married's Proportion")
}
pie_chart5<- function(data){
  dataset<- table(df$heart_disease)
  labels <- names(dataset)
  pct <- prop.table(dataset)*100
  labels2 <- paste(round(pct),"%",sep = "")
  labels3<- paste (labels, labels2)
  pie(dataset,labels = labels3, init.angle = 90, main = "Heart Disease's Proportion")
}
pie_chart6<- function(data){
  dataset<- table(df$hypertension)
  labels <- names(dataset)
  pct <- prop.table(dataset)*100
  labels2 <- paste(round(pct),"%",sep = "")
  labels3<- paste (labels, labels2)
  pie(dataset,labels = labels3, init.angle = 90, main = "Hypertension's Proportion")
}
par(mfrow=c(2,2))
pie_chart(df)
pie_chart2(df)
pie_chart3(df)
pie_chart4(df)

par(mfrow=c(2,1))
pie_chart5(df)
pie_chart6(df)

Now we can see their proportion data set.

#Logistic Regression

trainingData <- subset(df, select = c(2, 3, 4, 5, 6, 7,8,9,10, 11, 12))
head(trainingData)
##   gender age hypertension heart_disease ever_married     work_type
## 1   Male  67            0             1          Yes       Private
## 2 Female  61            0             0          Yes Self-employed
## 3   Male  80            0             1          Yes       Private
## 4 Female  49            0             0          Yes       Private
## 5 Female  79            1             0          Yes Self-employed
## 6   Male  81            0             0          Yes       Private
##   Residence_type avg_glucose_level      bmi  smoking_status stroke
## 1          Urban            228.69 36.00000 formerly smoked      1
## 2          Rural            202.21 27.32955    never smoked      1
## 3          Rural            105.92 32.00000    never smoked      1
## 4          Urban            171.23 34.00000          smokes      1
## 5          Rural            174.12 24.00000    never smoked      1
## 6          Urban            186.21 29.00000 formerly smoked      1

remove Id, because ID is a primary key or an unique data which mean we can’t do anything with it #creating Training Data & Validation Data by splitting ratio 0.8 or 80%

sapply(trainingData, function(x) sum(is.na(x)))
##            gender               age      hypertension     heart_disease 
##                 0                 0                 0                 0 
##      ever_married         work_type    Residence_type avg_glucose_level 
##                 0                 0                 0                 0 
##               bmi    smoking_status            stroke 
##                 0                 0                 0
temp <- sample.split(trainingData, SplitRatio = 0.8)
trainingsample <- subset(trainingData, temp == TRUE)
validsample <- subset(trainingData, temp == FALSE)
dim(trainingsample)
## [1] 3717   11
dim(validsample)
## [1] 1393   11
dim(df)
## [1] 5110   12

Before we continue, we do a split ratio up to 80 : 20. after that we can check their dimension to make sure that they are splitted by 80 : 20 #Modelling

model1 <- glm(stroke ~ . , family=binomial(link="logit"), data=trainingsample)
summary(model1)
## 
## Call:
## glm(formula = stroke ~ ., family = binomial(link = "logit"), 
##     data = trainingsample)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0057  -0.3208  -0.1663  -0.0948   3.4758  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -6.497e+00  8.105e-01  -8.017 1.08e-15 ***
## genderMale                 -3.234e-02  1.667e-01  -0.194  0.84619    
## genderOther                -1.055e+01  1.455e+03  -0.007  0.99422    
## age                         7.421e-02  6.768e-03  10.965  < 2e-16 ***
## hypertension                3.534e-01  1.948e-01   1.814  0.06973 .  
## heart_disease               3.758e-01  2.188e-01   1.718  0.08587 .  
## ever_marriedYes            -2.514e-01  2.621e-01  -0.960  0.33730    
## work_typeGovt_job          -1.188e+00  8.727e-01  -1.361  0.17359    
## work_typeNever_worked      -1.068e+01  3.746e+02  -0.029  0.97725    
## work_typePrivate           -1.078e+00  8.524e-01  -1.264  0.20617    
## work_typeSelf-employed     -1.365e+00  8.773e-01  -1.555  0.11987    
## Residence_typeUrban         4.990e-03  1.619e-01   0.031  0.97541    
## avg_glucose_level           4.072e-03  1.419e-03   2.869  0.00411 ** 
## bmi                         2.131e-03  1.330e-02   0.160  0.87270    
## smoking_statusnever smoked -9.566e-02  2.093e-01  -0.457  0.64758    
## smoking_statussmokes        3.230e-01  2.470e-01   1.308  0.19101    
## smoking_statusUnknown       1.150e-02  2.480e-01   0.046  0.96303    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1453.0  on 3716  degrees of freedom
## Residual deviance: 1160.2  on 3700  degrees of freedom
## AIC: 1194.2
## 
## Number of Fisher Scoring iterations: 14

In this first model, we can see that only age that has the most correlation between stroke. for the second model, let’s try using the data that has correlation between stroke which we have stated before

model2 <- glm(stroke ~  age + hypertension+ heart_disease+ ever_married +avg_glucose_level, family=binomial(link="logit"), data=trainingsample)
summary(model2)
## 
## Call:
## glm(formula = stroke ~ age + hypertension + heart_disease + ever_married + 
##     avg_glucose_level, family = binomial(link = "logit"), data = trainingsample)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9857  -0.3213  -0.1753  -0.0930   3.6944  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -7.210080   0.421812 -17.093  < 2e-16 ***
## age                0.068335   0.005982  11.423  < 2e-16 ***
## hypertension       0.350413   0.192295   1.822  0.06841 .  
## heart_disease      0.417002   0.215631   1.934  0.05313 .  
## ever_marriedYes   -0.269192   0.252836  -1.065  0.28702    
## avg_glucose_level  0.004215   0.001371   3.074  0.00211 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1453.0  on 3716  degrees of freedom
## Residual deviance: 1167.1  on 3711  degrees of freedom
## AIC: 1179.1
## 
## Number of Fisher Scoring iterations: 7
model3 <- glm(stroke ~age + hypertension, family=binomial(link="logit"), data=trainingsample)
summary(model3)
## 
## Call:
## glm(formula = stroke ~ age + hypertension, family = binomial(link = "logit"), 
##     data = trainingsample)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8637  -0.3323  -0.1828  -0.0866   3.7390  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -7.083289   0.384829 -18.406   <2e-16 ***
## age           0.071433   0.005751  12.421   <2e-16 ***
## hypertension  0.431841   0.189101   2.284   0.0224 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1453.0  on 3716  degrees of freedom
## Residual deviance: 1182.5  on 3714  degrees of freedom
## AIC: 1188.5
## 
## Number of Fisher Scoring iterations: 7

For this model, we use the most correlation which is age, but it is only 2 data that has the most correlation so we can add the bmi that we can used to create decision tree model later

#ROC Curve

model3 <- glm(stroke ~age + hypertension + bmi, family=binomial(link="logit"), data=trainingsample)
prediksi <- predict(model3, newdata=subset(validsample, select = c(2,3,9,11)),type = "response")
pd <- prediction(prediksi, validsample$stroke)
rocCurve <- performance(pd, measure="tpr", x.measure="fpr")
plot(rocCurve)

auc <- performance(pd, measure="auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8518437

In here we get AUC about 0.799 which mean it is already good. because the higher the AUC will create the better model which mean it will increase the accuracy of the data set prediction, now lets see the data set accuracy #Accuracy

result <- ifelse(prediksi>0.5, 1, 0)
misclassificationError <- mean(result != validsample$stroke)
print(paste("Accuracy : ", 1-misclassificationError))
## [1] "Accuracy :  0.951902368987796"

And we get accuracy about 0.95 or 95% accuracy dataset prediction this mean it is good dataset for a prediction

library(rpart)
library(rpart.plot)

now let’s continue creating decision tree modelling with our modelling before.

#Decision Tree Modelling

DTmodel<-rpart(stroke~ age + hypertension + bmi, data=trainingsample, method="class", cp = 0.0000000000001)#cp = complexity parameter
DTmodel
## n= 3717 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 3717 182 0 (0.95103578 0.04896422)  
##     2) age< 67.5 3110  75 0 (0.97588424 0.02411576) *
##     3) age>=67.5 607 107 0 (0.82372323 0.17627677)  
##       6) age< 75.5 283  36 0 (0.87279152 0.12720848) *
##       7) age>=75.5 324  71 0 (0.78086420 0.21913580)  
##        14) bmi>=27.66477 156  28 0 (0.82051282 0.17948718)  
##          28) age< 79.5 91  13 0 (0.85714286 0.14285714) *
##          29) age>=79.5 65  15 0 (0.76923077 0.23076923)  
##            58) age>=80.5 41   7 0 (0.82926829 0.17073171) *
##            59) age< 80.5 24   8 0 (0.66666667 0.33333333)  
##             118) hypertension< 0.5 17   4 0 (0.76470588 0.23529412) *
##             119) hypertension>=0.5 7   3 1 (0.42857143 0.57142857) *
##        15) bmi< 27.66477 168  43 0 (0.74404762 0.25595238) *
rpart.plot(DTmodel, extra = 106)

prediksiDT<-predict(DTmodel, validsample, type="class")
tempsample<-table(prediksiDT, validsample$stroke)
tempsample
##           
## prediksiDT    0    1
##          0 1321   67
##          1    5    0

#o

sum(diag(tempsample)) / sum(tempsample)
## [1] 0.948313
1- sum(diag(tempsample)) / sum(tempsample)
## [1] 0.05168701

in here, we can see that the overall accuracy for DT model is significantly high which is about 0.94 or 94.18% and we get 0.05 or 5% misclassification accuracy which make this DT model pretty accurate

#Conclusion This data set is good for modelling