#Import the needed libraries
library(ROCR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(visdat)
library(reshape2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(ggplot2)
library(DataExplorer)
library(caTools)
First, we import the libraries we need.
df <- read.csv("StrokeData.csv", stringsAsFactors= TRUE)
head(df)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 202.21 N/A never smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24 never smoked 1
## 6 Urban 186.21 29 formerly smoked 1
str(df)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 1 2 1 1 2 2 1 1 1 ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 5 4 4 5 4 4 4 4 4 ...
## $ Residence_type : Factor w/ 2 levels "Rural","Urban": 2 1 1 2 1 2 1 2 1 2 ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : Factor w/ 419 levels "10.3","11.3",..: 240 419 199 218 114 164 148 102 419 116 ...
## $ smoking_status : Factor w/ 4 levels "formerly smoked",..: 1 2 2 3 2 1 2 2 4 4 ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
levels(df$gender)
## [1] "Female" "Male" "Other"
#Attributes
id = unique identifier for each record
gender = Male, Female, Other
age = age in years
hypertension = 1 if the patient has hypertension, 0 otherwise
heart_disease = 1 if the patient has heart disease, 0 otherwise
ever_married = Yes, No
work_type = children, Govt_job, Never_worked, Private, Self-employed
Residence_type = Rural, Urban
avg_glucose_level = average glucose level of the patient
bmi = body mass index of the patient
smoking_status = formerly smoked, never smoked, smokes, Unknown
stroke = 1 if the patient had a stroke, 0 otherwise
#Exploratory Data Analysis
dim(df)
## [1] 5110 12
#Checking dimension
summary(df)
## id gender age hypertension
## Min. : 67 Female:2994 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Male :2115 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Other : 1 Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
##
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 No :1757 children : 687 Rural:2514
## 1st Qu.:0.00000 Yes:3353 Govt_job : 657 Urban:2596
## Median :0.00000 Never_worked : 22
## Mean :0.05401 Private :2925
## 3rd Qu.:0.00000 Self-employed: 819
## Max. :1.00000
##
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 N/A : 201 formerly smoked: 885 Min. :0.00000
## 1st Qu.: 77.25 28.7 : 41 never smoked :1892 1st Qu.:0.00000
## Median : 91.89 28.4 : 38 smokes : 789 Median :0.00000
## Mean :106.15 26.1 : 37 Unknown :1544 Mean :0.04873
## 3rd Qu.:114.09 26.7 : 37 3rd Qu.:0.00000
## Max. :271.74 27.6 : 37 Max. :1.00000
## (Other):4719
From the summary we can see that bmi has 201 missing values (coded as "N/A"). Instead of removing those rows, we fill them with the average bmi.
#Replacing "N/A" values in the bmi column with the bmi average
df$bmi = as.character(df$bmi)
df$bmi[df$bmi == "N/A"]<- "0"
df$bmi= as.integer(df$bmi)
df$bmi[df$bmi == 0]<-mean(df$bmi)
df$bmi[is.na(df$bmi)]=0
head(df)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.00000 formerly smoked 1
## 2 Rural 202.21 27.32955 never smoked 1
## 3 Rural 105.92 32.00000 never smoked 1
## 4 Urban 171.23 34.00000 smokes 1
## 5 Rural 174.12 24.00000 never smoked 1
## 6 Urban 186.21 29.00000 formerly smoked 1
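For reference, a more direct imputation would keep bmi numeric (no integer truncation) and compute the mean over the observed values only. A minimal sketch applied to the raw column as read from the CSV; this is not what was run above, so the imputed value differs slightly from the 27.33 shown:
#Alternative imputation sketch: keep bmi numeric and impute with the mean of observed values
raw     <- read.csv("StrokeData.csv", stringsAsFactors = TRUE)
bmi_num <- suppressWarnings(as.numeric(as.character(raw$bmi)))  # "N/A" becomes NA here
bmi_num[is.na(bmi_num)] <- mean(bmi_num, na.rm = TRUE)          # fill NAs with the observed mean
#raw$bmi <- bmi_num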
hist(df$avg_glucose_level)
hist(df$bmi)
Before we continue, let's look for outliers in avg_glucose_level and bmi.
#Checking outliers using the three-sigma, Hampel, and boxplot rules
#three sigma
x<- df$avg_glucose_level
t <- 3
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
up <- mu + t * sig
down<- mu - t * sig
#hampel
a<-df$avg_glucose_level
mus <- median(a, na.rm = TRUE)
sigs <- mad(a, na.rm = TRUE)
ups <- mu + t * sigs
downs <- mu - t * sigs
#boxplot rule
t <- 1.5
x<- df$avg_glucose_level
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q <- xU - xL
upv <- xU + t * Q
downv <- xU - t * Q
par(mfrow=c(1,1))
plot(df$avg_glucose_level,main ="Three Sigma edit rule")
abline(h = up,lty=2,lwd=1, col = "red")
abline(h = down, lty = 2, lwd =1, col = "red")
par(mfrow=c(1,1))
plot(df$avg_glucose_level,main = "Hampel identifier")
abline(h = ups, lty = 2, lwd =1, col = "red")
abline(h = downs, lty = 2, lwd =1,col="red")
par(mfrow=c(1,1))
plot(df$avg_glucose_level,main = "Boxplot Rule")
abline(h = upv,lty=2,lwd=1, col = "red")
abline(h = downv, lty = 2, lwd =1, col = "red")
cat("Three sigma rule\n")
## Three sigma rule
cat("UP : ")
## UP :
df$avg_glucose_level[which(df$avg_glucose_level > up)]
## [1] 252.72 243.58 259.63 249.31 263.32 271.74 242.52 250.89 247.51 243.53
## [11] 242.30 243.50 251.60 247.69 250.20 254.60 254.63 246.34 251.46 267.76
## [21] 246.53 244.28 251.99 253.16 242.84 249.29 242.94 247.48 266.59 243.73
## [31] 243.59 250.80 255.17 267.61 260.85 248.37 263.56 247.97 248.24 253.93
## [41] 254.95 247.87 261.67 256.74 244.30 242.62 243.52 267.60 253.86
cat("Down : ")
## Down :
df$avg_glucose_level[which(df$avg_glucose_level < down)]
## numeric(0)
cat("\nHampeL identifier:\n")
##
## HampeL identifier:
cat("Up : ")
## Up :
df$avg_glucose_level[which(df$avg_glucose_level > ups)]
## [1] 228.69 202.21 186.21 219.84 214.09 191.61 221.29 217.08 193.94 233.29
## [11] 228.70 208.30 189.84 195.23 211.78 212.08 196.92 252.72 219.72 213.03
## [21] 243.58 197.54 196.71 237.75 194.99 185.17 221.58 228.56 240.09 226.98
## [31] 235.63 240.59 190.32 231.61 191.82 224.10 216.94 259.63 249.31 219.91
## [41] 200.59 190.14 206.09 263.32 207.28 194.37 199.20 221.79 239.07 223.83
## [51] 231.56 221.89 195.71 203.87 185.49 213.22 215.94 209.86 205.77 271.74
## [61] 200.62 242.52 208.65 205.33 210.40 199.86 219.73 250.89 205.35 216.58
## [71] 184.40 199.84 218.46 211.06 197.28 233.94 247.51 210.95 243.53 205.84
## [81] 198.21 206.72 214.45 190.70 203.04 242.30 220.49 218.46 216.70 234.58
## [91] 235.85 243.50 229.92 215.60 239.64 200.28 205.23 209.58 210.78 251.60
## [101] 213.37 223.36 203.81 205.26 211.03 225.47 227.10 201.76 217.30 196.01
## [111] 198.69 186.17 210.48 193.83 247.69 191.47 239.82 189.57 207.58 215.64
## [121] 196.36 188.11 205.50 204.86 228.08 219.53 219.97 214.05 200.49 240.71
## [131] 197.10 194.62 222.21 250.20 254.60 212.01 186.45 189.49 186.32 226.70
## [141] 194.04 237.15 231.19 207.32 207.64 236.84 204.63 232.89 195.03 227.91
## [151] 204.50 206.25 254.63 246.34 195.16 223.68 229.20 193.22 204.57 251.46
## [161] 220.52 195.04 218.65 211.49 224.71 226.11 210.94 230.68 198.02 204.17
## [171] 267.76 217.71 239.52 229.86 210.96 195.25 217.39 201.25 197.79 214.77
## [181] 189.45 206.40 197.58 199.96 205.77 237.21 246.53 206.33 206.98 227.28
## [191] 228.70 244.28 251.99 191.79 216.88 222.29 213.11 227.51 201.01 210.00
## [201] 237.58 207.45 226.93 253.16 238.53 207.79 196.20 231.76 216.92 194.98
## [211] 218.54 237.17 197.09 242.84 202.66 216.90 210.00 208.05 222.60 199.14
## [221] 191.48 200.16 190.40 215.90 233.52 213.54 188.69 219.50 217.66 227.16
## [231] 209.90 211.88 225.60 210.23 234.82 230.59 224.63 185.71 208.17 185.31
## [241] 203.04 187.87 213.87 222.85 198.36 196.25 194.53 204.05 199.18 209.26
## [251] 217.11 222.46 187.52 237.74 223.35 201.07 208.06 186.95 198.24 229.21
## [261] 209.06 228.42 212.97 202.05 206.25 231.69 219.96 197.69 199.88 208.78
## [271] 222.29 220.36 187.88 191.66 217.75 226.88 186.40 203.81 189.44 249.29
## [281] 211.35 206.59 196.33 242.94 226.75 185.00 199.83 227.81 240.81 239.28
## [291] 231.50 192.37 220.47 196.91 247.48 216.00 219.39 220.47 198.33 191.33
## [301] 206.52 216.96 232.81 207.95 229.58 187.22 227.04 214.42 233.71 216.40
## [311] 266.59 227.94 205.00 203.44 243.73 200.28 221.43 213.38 192.16 215.72
## [321] 202.57 209.50 203.16 201.45 206.15 196.61 219.92 231.95 216.38 213.33
## [331] 243.59 227.98 208.20 199.42 190.13 235.54 227.74 213.80 250.80 217.84
## [341] 217.00 217.40 190.92 255.17 217.55 227.96 231.71 196.81 222.66 223.58
## [351] 198.79 192.39 233.30 201.38 236.14 193.81 239.95 202.21 198.79 202.55
## [361] 232.12 203.57 230.78 204.98 227.89 216.71 202.67 221.80 202.38 215.81
## [371] 220.24 195.61 267.61 207.62 201.58 231.43 220.26 211.12 215.33 212.02
## [381] 228.20 260.85 223.90 207.96 205.01 191.78 214.43 220.64 204.77 248.37
## [391] 194.53 228.92 227.68 226.73 219.17 215.92 198.12 240.86 263.56 200.14
## [401] 235.45 207.71 228.05 223.14 214.51 231.31 238.78 233.59 188.13 205.97
## [411] 190.89 193.87 214.77 189.88 197.11 192.47 199.38 202.98 198.32 226.38
## [421] 236.79 219.82 239.19 206.62 216.88 204.92 226.84 234.35 200.73 202.51
## [431] 218.00 209.15 202.66 196.50 209.50 219.81 205.23 234.27 239.21 196.08
## [441] 193.45 219.38 217.94 216.64 208.85 219.70 208.05 185.28 198.30 206.66
## [451] 200.68 218.60 223.26 221.83 218.10 200.46 217.79 233.47 200.98 219.67
## [461] 207.60 247.97 231.15 186.54 221.06 212.62 217.74 208.99 197.36 222.52
## [471] 232.64 207.37 201.96 213.43 248.24 229.94 202.06 253.93 194.75 207.84
## [481] 228.26 203.76 205.78 230.74 216.19 200.66 228.50 232.29 200.91 236.04
## [491] 254.95 196.58 189.82 193.61 195.74 221.24 192.50 212.92 191.94 247.87
## [501] 229.73 261.67 256.74 221.08 208.39 227.23 203.27 234.50 190.67 197.06
## [511] 216.07 203.87 235.06 195.43 200.25 223.64 199.78 244.30 223.16 226.28
## [521] 213.92 212.19 200.80 222.58 206.53 232.78 187.47 234.06 242.62 231.54
## [531] 219.80 187.99 234.45 240.69 217.57 234.51 214.73 208.69 231.72 206.53
## [541] 193.80 203.01 198.84 243.52 238.27 208.31 211.83 215.69 267.60 215.07
## [551] 225.35 196.26 212.87 185.27 206.49 253.86 203.36 191.15 223.78 211.58
## [561] 193.88
cat("Down: ")
## Down:
df$avg_glucose_level[which(df$avg_glucose_level < downs)]
## numeric(0)
cat("\nBoxplot rule\n")
##
## Boxplot rule
cat("up :")
## up :
df$avg_glucose_level[which(df$avg_glucose_level > upv)]
## [1] 228.69 202.21 171.23 174.12 186.21 219.84 214.09 191.61 221.29 217.08
## [11] 193.94 233.29 228.70 208.30 189.84 195.23 211.78 212.08 196.92 252.72
## [21] 219.72 213.03 243.58 197.54 196.71 237.75 194.99 180.93 185.17 221.58
## [31] 179.12 228.56 240.09 226.98 235.63 240.59 190.32 231.61 191.82 224.10
## [41] 216.94 259.63 249.31 219.91 200.59 190.14 182.99 206.09 263.32 207.28
## [51] 194.37 199.20 221.79 239.07 169.67 223.83 231.56 221.89 195.71 203.87
## [61] 185.49 213.22 215.94 209.86 205.77 271.74 200.62 242.52 175.29 208.65
## [71] 205.33 210.40 199.86 219.73 250.89 205.35 216.58 184.40 199.84 218.46
## [81] 211.06 197.28 233.94 247.51 210.95 243.53 205.84 198.21 206.72 214.45
## [91] 190.70 203.04 242.30 220.49 218.46 216.70 234.58 235.85 243.50 182.20
## [101] 229.92 215.60 239.64 200.28 205.23 209.58 210.78 251.60 213.37 223.36
## [111] 178.29 203.81 205.26 211.03 225.47 180.63 227.10 201.76 170.05 217.30
## [121] 196.01 184.15 198.69 186.17 183.45 210.48 193.83 183.34 247.69 191.47
## [131] 239.82 189.57 207.58 182.86 215.64 196.36 188.11 205.50 204.86 228.08
## [141] 219.53 219.97 214.05 200.49 240.71 197.10 194.62 222.21 250.20 173.43
## [151] 184.25 254.60 212.01 186.45 189.49 186.32 226.70 183.10 194.04 237.15
## [161] 231.19 207.32 207.64 236.84 204.63 232.89 195.03 170.95 227.91 204.50
## [171] 206.25 254.63 246.34 195.16 223.68 229.20 193.22 204.57 251.46 220.52
## [181] 195.04 218.65 211.49 224.71 226.11 210.94 230.68 198.02 204.17 267.76
## [191] 217.71 180.76 239.52 229.86 210.96 195.25 217.39 201.25 197.79 214.77
## [201] 181.23 189.45 206.40 178.76 197.58 199.96 205.77 237.21 246.53 206.33
## [211] 206.98 227.28 228.70 169.97 244.28 251.99 191.79 216.88 222.29 213.11
## [221] 227.51 201.01 210.00 237.58 207.45 226.93 253.16 238.53 207.79 196.20
## [231] 231.76 216.92 194.98 218.54 183.00 237.17 178.33 197.09 242.84 202.66
## [241] 216.90 210.00 208.05 222.60 199.14 191.48 200.16 190.40 215.90 233.52
## [251] 213.54 188.69 219.50 217.66 227.16 209.90 176.48 211.88 225.60 210.23
## [261] 234.82 230.59 224.63 185.71 208.17 185.31 203.04 187.87 213.87 222.85
## [271] 198.36 196.25 194.53 204.05 199.18 209.26 217.11 222.46 187.52 237.74
## [281] 223.35 201.07 208.06 186.95 198.24 229.21 209.06 228.42 212.97 202.05
## [291] 206.25 231.69 219.96 197.69 199.88 170.22 208.78 222.29 220.36 187.88
## [301] 191.66 217.75 226.88 186.40 169.49 203.81 170.76 189.44 249.29 211.35
## [311] 206.59 196.33 242.94 226.75 185.00 199.83 227.81 240.81 239.28 231.50
## [321] 192.37 220.47 196.91 180.80 247.48 216.00 219.39 220.47 173.96 198.33
## [331] 191.33 206.52 216.96 170.93 232.81 207.95 229.58 187.22 227.04 214.42
## [341] 233.71 216.40 266.59 227.94 205.00 203.44 243.73 176.25 200.28 221.43
## [351] 213.38 192.16 215.72 173.14 202.57 209.50 203.16 201.45 206.15 196.61
## [361] 219.92 231.95 216.38 213.33 172.33 243.59 169.43 183.87 227.98 208.20
## [371] 199.42 190.13 235.54 178.89 227.74 213.80 250.80 217.84 217.00 217.40
## [381] 190.92 182.90 255.17 217.55 227.96 231.71 196.81 222.66 223.58 198.79
## [391] 192.39 233.30 201.38 236.14 193.81 239.95 170.88 202.21 181.30 198.79
## [401] 202.55 232.12 203.57 230.78 204.98 227.89 216.71 202.67 221.80 202.38
## [411] 215.81 220.24 195.61 267.61 176.71 207.62 201.58 231.43 220.26 211.12
## [421] 177.91 215.33 212.02 228.20 260.85 223.90 169.74 207.96 176.78 205.01
## [431] 191.78 214.43 220.64 204.77 248.37 194.53 228.92 227.68 226.73 219.17
## [441] 215.92 198.12 240.86 263.56 200.14 235.45 207.71 228.05 223.14 174.43
## [451] 214.51 231.31 238.78 233.59 188.13 205.97 190.89 193.87 214.77 189.88
## [461] 197.11 192.47 199.38 202.98 198.32 226.38 236.79 219.82 239.19 206.62
## [471] 216.88 204.92 226.84 234.35 200.73 202.51 218.00 209.15 202.66 196.50
## [481] 209.50 219.81 205.23 234.27 239.21 196.08 176.38 175.74 193.45 180.45
## [491] 219.38 173.90 217.94 216.64 173.97 208.85 219.70 208.05 185.28 198.30
## [501] 206.66 200.68 218.60 223.26 172.27 221.83 218.10 200.46 217.79 233.47
## [511] 181.23 200.98 219.67 207.60 247.97 231.15 186.54 221.06 212.62 217.74
## [521] 208.99 197.36 222.52 232.64 207.37 201.96 213.43 248.24 229.94 202.06
## [531] 253.93 194.75 207.84 228.26 203.76 205.78 179.67 230.74 216.19 200.66
## [541] 228.50 232.29 200.91 236.04 254.95 196.58 189.82 193.61 195.74 221.24
## [551] 192.50 212.92 191.94 247.87 229.73 261.67 256.74 221.08 208.39 227.23
## [561] 203.27 234.50 190.67 197.06 216.07 179.14 203.87 235.06 195.43 200.25
## [571] 223.64 199.78 176.42 244.30 223.16 226.28 172.86 213.92 212.19 200.80
## [581] 222.58 206.53 232.78 187.47 234.06 242.62 174.54 231.54 219.80 187.99
## [591] 234.45 240.69 217.57 234.51 182.22 214.73 208.69 231.72 206.53 193.80
## [601] 203.01 177.56 198.84 243.52 238.27 208.31 176.34 211.83 215.69 267.60
## [611] 215.07 225.35 196.26 182.52 212.87 183.43 185.27 206.49 253.86 203.36
## [621] 175.92 191.15 223.78 211.58 179.38 193.88 174.37
cat("down : ")
## down :
df$avg_glucose_level[which(df$avg_glucose_level < downv)]
## [1] 58.57 58.09 57.92 56.11 57.93 57.08 57.33 55.78 55.42 56.96 58.63 55.39
## [13] 58.26 58.37 58.65 56.11 57.27 55.32 57.57 56.23 56.75 55.25 56.18 55.34
## [25] 55.83 55.26 55.27 58.47 56.77 56.99 57.51 56.31 57.82 57.94 58.39 57.93
## [37] 56.42 58.38 56.48 56.07 55.86 56.30 57.26 56.47 58.03 55.67 57.02 55.22
## [49] 57.89 57.37 56.07 58.48 58.30 58.71 55.51 58.29 56.63 57.33 57.95 58.25
## [61] 58.02 56.84 58.69 56.67 58.66 56.30 56.43 57.92 56.51 55.41 57.02 55.57
## [73] 58.14 55.34 55.61 58.70 56.90 57.30 58.51 56.34 55.62 56.71 55.58 56.12
## [85] 55.46 58.63 57.43 58.41 56.89 57.47 57.76 58.65 57.51 56.12 57.02 58.35
## [97] 57.28 58.42 56.37 56.11 55.64 56.08 56.33 56.95 57.77 57.17 57.80 56.42
## [109] 55.59 55.96 56.75 57.57 58.19 55.72 56.74 55.28 56.32 55.47 56.21 56.08
## [121] 57.42 58.01 56.85 58.24 56.79 57.60 56.13 58.55 57.59 55.23 55.35 57.46
## [133] 57.96 56.33 56.90 57.79 58.01 57.38 56.94 57.09 56.48 56.64 55.79 55.93
## [145] 58.66 57.40 57.56 57.56 58.42 57.52 58.23 58.64 56.11 58.19 57.10 57.28
## [157] 58.81 58.35 57.83 57.15 55.12 56.87 58.39 57.42 57.06 56.85 55.84 58.72
The avg_glucose_level column contains many outliers: the Hampel and boxplot rules each flag several hundred observations (roughly 11-16% of the data), almost all on the high side, while the three-sigma rule flags far fewer.
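To put a number on "many outliers", the flagged fraction under each rule can be computed directly from the thresholds defined above (a minimal sketch; output not shown here):
#Fraction of avg_glucose_level observations flagged by each rule (sketch)
n_obs <- length(df$avg_glucose_level)
c(three_sigma = sum(df$avg_glucose_level > up  | df$avg_glucose_level < down)  / n_obs,
  hampel      = sum(df$avg_glucose_level > ups | df$avg_glucose_level < downs) / n_obs,
  boxplot     = sum(df$avg_glucose_level > upv | df$avg_glucose_level < downv) / n_obs)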
#three sigma
x<- df$bmi
t <- 3
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
up <- mu + t * sig
down<- mu - t * sig
#hampel
a<-df$bmi
mus <- median(a, na.rm = TRUE)
sigs <- mad(a, na.rm = TRUE)
ups <- mu + t * sigs
downs <- mu - t * sigs
#boxplot rule
t <- 1.5
x<- df$bmi
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q <- xU - xL
upv <- xU + t * Q
downv <- xU - t * Q
par(mfrow=c(1,1))
plot(df$bmi,main ="Three Sigma edit rule")
abline(h = up,lty=2,lwd=1, col = "red")
abline(h = down, lty = 2, lwd =1, col = "red")
par(mfrow=c(1,1))
plot(df$bmi,main = "Hampel identifier")
abline(h = ups, lty = 2, lwd =1, col = "red")
abline(h = downs, lty = 2, lwd =1,col="red")
par(mfrow=c(1,1))
plot(df$bmi,main = "Boxplot Rule")
abline(h = upv,lty=2,lwd=1, col = "red")
abline(h = downv, lty = 2, lwd =1, col = "red")
cat("Three sigma rule\n")
## Three sigma rule
cat("UP : ")
## UP :
df$bmi[which(df$bmi > up)]
## [1] 56 54 60 54 64 54 60 71 54 55 55 57 54 52 78 53 55 55 54 52 66 55 55 57 56
## [26] 57 54 56 97 53 53 52 52 55 53 63 52 61 58 52 53 59 52 52 54 61 53 54 55 57
## [51] 64 92 55 57 55 57 60 54 56
cat("Down : ")
## Down :
df$bmi[which(df$bmi < down)]
## numeric(0)
cat("\nHampeL identifier:\n")
##
## HampeL identifier:
cat("Up : ")
## Up :
df$bmi[which(df$bmi > ups)]
## [1] 48 56 50 54 60 54 48 64 54 49 60 51 51 71 50 54 55 55 57 54 52 50 78 50 53
## [26] 55 48 50 49 55 54 50 52 66 55 48 55 57 49 56 51 57 48 49 49 54 56 97 53 49
## [51] 48 49 48 48 53 48 52 52 55 53 50 51 63 52 61 48 50 48 58 49 50 52 48 49 51
## [76] 53 50 59 52 52 54 61 49 53 54 55 50 50 57 64 92 50 55 57 55 48 57 50 48 51
## [101] 60 54 56 49
cat("Down: ")
## Down:
df$bmi[which(df$bmi < downs)]
## numeric(0)
cat("\nBoxplot rule\n")
##
## Boxplot rule
cat("up :")
## up :
df$bmi[which(df$bmi > upv)]
## [1] 48 47 56 46 50 54 60 54 48 64 47 46 46 54 49 60 51 51 71 50 47 54 55 55 57
## [26] 54 52 46 50 78 50 53 55 48 50 46 49 55 54 50 47 52 66 55 48 55 57 49 56 51
## [51] 57 48 49 49 54 56 97 53 49 48 49 48 48 53 46 48 52 52 55 53 50 46 51 63 52
## [76] 61 48 46 50 48 58 49 50 52 48 49 51 53 50 59 47 52 52 54 61 49 53 47 54 47
## [101] 55 46 50 50 57 46 64 92 50 55 46 57 47 55 48 57 47 46 46 50 47 48 51 60 47
## [126] 47 46 54 56 46 49 47 46
cat("down : ")
## down :
df$bmi[which(df$bmi< downv)]
## [1] 16 18 17 17 16 18 16 18 16 15 16 16 17 17 18 16 13 18 15 17 17 17 16 16 16
## [26] 18 17 18 16 17 17 12 18 14 16 18 16 17 17 17 14 17 18 15 12 16 18 15 18 15
## [51] 17 18 18 18 18 16 18 17 16 18 16 18 17 18 17 18 18 17 14 16 14 15 18 17 18
## [76] 18 17 18 17 16 17 18 17 18 16 17 15 17 18 18 16 16 17 16 17 15 15 18 18 18
## [101] 17 14 10 14 16 15 18 18 18 16 16 13 17 17 16 18 18 16 16 18 17 16 15 18 17
## [126] 18 17 16 18 14 18 17 16 18 18 17 16 17 18 18 16 17 17 17 17 15 16 16 17 13
## [151] 18 11 17 18 14 18 15 16 18 15 18 18 18 17 18 15 18 17 16 18 17 18 18 15 16
## [176] 17 18 18 18 17 17 17 18 18 18 17 17 18 17 15 15 16 14 15 14 16 18 17 17 17
## [201] 18 18 17 14 18 16 17 17 15 16 17 17 17 14 18 18 18 15 18 18 17 18 18 17 17
## [226] 16 18 15 18 15 15 16 17 16 17 17 11 18 17 12 13 18 14 16 15 16 16 18 14 18
## [251] 15 14 16 15 14 18 16 18 17 17 18 15 15 17 13 14 15 16 18 15 18 18 17 18 15
## [276] 18 18 16 18 16 15 16 16 16 15 18 17 17 18 18 18 18 18 18 16 17 17 18 18 13
## [301] 16 13 16 17 17 15 16 18 13 16 17 17 17 17 17 16 18 17 14 17 18 16 14 18 15
## [326] 14 17 17 16 16 17 17 16 16 16 18 14 18 18 17 18 18 16 18 18 17 14 16 18 18
## [351] 17 18 16 16 14 16 15 18 13 17 17 16 15 14 17 18 16 16 17 14 17 18 18 18 17
## [376] 16 18 18 18 18 18 18 16 18 13 16 14 16 15 16 15 17 18 17 15 17 18 16 15 17
## [401] 16 17 17 18
As we can see, the same rules flag fewer extreme bmi values on the high side (roughly 60-130 observations), although the boxplot rule's lower fence still catches a few hundred low values.
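One caveat on the rules above (for both avg_glucose_level and bmi): the Hampel thresholds are centred on the mean mu rather than the median mus, and the lower boxplot fence is built from the upper quartile xU rather than the lower quartile xL. A sketch of the textbook formulas for avg_glucose_level, reusing the same variable names; this is not what produced the output above:
#Conventional Hampel identifier and boxplot fences (sketch)
a     <- df$avg_glucose_level
ups   <- median(a, na.rm = TRUE) + 3 * mad(a, na.rm = TRUE)     # Hampel upper bound
downs <- median(a, na.rm = TRUE) - 3 * mad(a, na.rm = TRUE)     # Hampel lower bound
xL    <- quantile(a, probs = 0.25, na.rm = TRUE, names = FALSE)
xU    <- quantile(a, probs = 0.75, na.rm = TRUE, names = FALSE)
upv   <- xU + 1.5 * (xU - xL)                                   # boxplot upper fence
downv <- xL - 1.5 * (xU - xL)                                   # boxplot lower fence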
#Checking correlations between variables using hetcor()
library(polycor)
hetcor(df)
## Warning in FUN(X[[i]], ...): polychoric correlation between variables work_type and smoking_status produced warnings:
## NaNs produced
## NaNs produced
## NaNs produced
## NaNs produced
## NaNs produced
## NaNs produced
## NaNs produced
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## id gender age hypertension heart_disease
## id 1 Polyserial Pearson Pearson Pearson
## gender 0.003529 1 Polyserial Polyserial Polyserial
## age 0.003538 -0.03568 1 Pearson Pearson
## hypertension 0.00355 0.02615 0.2764 1 Pearson
## heart_disease -0.001296 0.1052 0.2638 0.1083 1
## ever_married 0.01768 -0.051 0.771 0.2611 0.1841
## work_type -0.002344 -0.09329 0.5895 0.1591 0.1211
## Residence_type -0.001759 -0.01114 0.01778 -0.009918 0.003877
## avg_glucose_level 0.001092 0.06914 0.2382 0.1745 0.1619
## bmi 0.007521 -0.03509 0.3234 0.1583 0.03639
## smoking_status -0.01948 0.0376 -0.4118 -0.1327 -0.07388
## stroke 0.006388 0.01111 0.2453 0.1279 0.1349
## ever_married work_type Residence_type avg_glucose_level
## id Polyserial Polyserial Polyserial Pearson
## gender Polychoric Polychoric Polychoric Polyserial
## age Polyserial Polyserial Polyserial Pearson
## hypertension Polyserial Polyserial Polyserial Pearson
## heart_disease Polyserial Polyserial Polyserial Pearson
## ever_married 1 Polychoric Polychoric Polyserial
## work_type 0.5715 1 Polychoric Polyserial
## Residence_type 0.01013 0.00173 1 Polyserial
## avg_glucose_level 0.2228 0.1076 -0.006199 1
## bmi 0.44 0.3297 -0.001166 0.1659
## smoking_status -0.4163 -0.3824 0.0032 -0.1055
## stroke 0.1743 0.0994 0.01941 0.1319
## bmi smoking_status stroke
## id Pearson Polyserial Pearson
## gender Polyserial Polychoric Polyserial
## age Pearson Polyserial Pearson
## hypertension Pearson Polyserial Pearson
## heart_disease Pearson Polyserial Pearson
## ever_married Polyserial Polychoric Polyserial
## work_type Polyserial Polychoric Polyserial
## Residence_type Polyserial Polychoric Polyserial
## avg_glucose_level Pearson Polyserial Pearson
## bmi 1 Polyserial Pearson
## smoking_status -0.245 1 Polyserial
## stroke 0.03504 -0.07464 1
##
## Standard Errors:
## id gender age hypertension heart_disease
## id
## gender 0.01766
## age 0.01399 0.0176
## hypertension 0.01399 0.01756 0.01292
## heart_disease 0.01399 0.01732 0.01302 0.01383
## ever_married 0.01806 0.02273 0.007272 0.02068 0.02155
## work_type 0.01538 0.01925 0.009173 0.01498 0.01526
## Residence_type 0.01754 0.02213 0.01753 0.01753 0.01754
## avg_glucose_level 0.01399 0.01745 0.0132 0.01356 0.01362
## bmi 0.01399 0.01774 0.01253 0.01364 0.01397
## smoking_status 0.01517 0.01909 0.01209 0.01462 0.01495
## stroke 0.01399 0.0176 0.01315 0.01376 0.01374
## ever_married work_type Residence_type avg_glucose_level
## id
## gender
## age
## hypertension
## heart_disease
## ever_married
## work_type 0.0143
## Residence_type 0.02263 0.01928
## avg_glucose_level 0.01883 0.01518 0.01753
## bmi 0.01512 0.01326 0.01754 0.01361
## smoking_status 0.01669 0.01413 0.01903 0.01486
## stroke 0.02167 0.01532 0.01756 0.01375
## bmi smoking_status
## id
## gender
## age
## hypertension
## heart_disease
## ever_married
## work_type
## Residence_type
## avg_glucose_level
## bmi
## smoking_status 0.01398
## stroke 0.01397 0.01501
##
## n = 5110
##
## P-values for Tests of Bivariate Normality:
## id gender age hypertension heart_disease
## id
## gender 6.497e-51
## age 1.668e-65 2.519e-20
## hypertension 0 0 0
## heart_disease 0 0 0 0
## ever_married 9.89e-53 0.1853 3.86e-144 0 0
## work_type 0 0.006454 0 0 0
## Residence_type 1.585e-53 0.2443 3.445e-17 0 0
## avg_glucose_level 0 1.02e-223 1.482e-277 0 0
## bmi 4.839e-76 3.674e-36 2.82e-176 0 0
## smoking_status 1.733e-48 4.088e-10 2.974e-71 0 0
## stroke 0 0 0 0 0
## ever_married work_type Residence_type avg_glucose_level
## id
## gender
## age
## hypertension
## heart_disease
## ever_married
## work_type 1.004e-191
## Residence_type <NA> 0.1941
## avg_glucose_level 6.941e-243 0 8.06e-227
## bmi 2.038e-73 0 1.205e-29 3.352e-259
## smoking_status 1.297e-24 2.373e-163 0.06799 3.551e-230
## stroke 0 0 0 0
## bmi smoking_status
## id
## gender
## age
## hypertension
## heart_disease
## ever_married
## work_type
## Residence_type
## avg_glucose_level
## bmi
## smoking_status 5.334e-69
## stroke 0 0
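The correlations with stroke can also be pulled out of the fitted object directly, since hetcor() stores its estimates in the correlations component (a minimal sketch; output not shown):
#Extracting the stroke column of the mixed correlation matrix (sketch)
hc <- hetcor(df)
round(sort(hc$correlations[, "stroke"], decreasing = TRUE), 3)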
From the correlation matrix, stroke is most strongly associated with age, hypertension, heart_disease, ever_married, and avg_glucose_level.
#Checking the proportion of each variable with pie charts
pie_chart<- function(data){
dataset<- table(df$stroke)
labels <- names(dataset)
pct <- prop.table(dataset)*100
labels2 <- paste(round(pct),"%",sep = "")
labels3<- paste (labels, labels2)
pie(dataset,labels = labels3, init.angle = 90, main = "Stroke's Proportion")
}
pie_chart2<- function(data){
dataset<- table(df$gender)
labels <- names(dataset)
pct <- prop.table(dataset)*100
labels2 <- paste(round(pct),"%",sep = "")
labels3<- paste (labels, labels2)
pie(dataset,labels = labels3, init.angle = 90, main = "Gender's Proportion")
}
pie_chart3<- function(data){
dataset<- table(df$work_type)
labels <- names(dataset)
pct <- prop.table(dataset)*100
labels2 <- paste(round(pct),"%",sep = "")
labels3<- paste (labels, labels2)
pie(dataset,labels = labels3, init.angle = 90, main = "Work Type's Proportion")
}
pie_chart4<- function(data){
dataset<- table(df$ever_married)
labels <- names(dataset)
pct <- prop.table(dataset)*100
labels2 <- paste(round(pct),"%",sep = "")
labels3<- paste (labels, labels2)
pie(dataset,labels = labels3, init.angle = 90, main = "Ever Married's Proportion")
}
pie_chart5<- function(data){
dataset<- table(df$heart_disease)
labels <- names(dataset)
pct <- prop.table(dataset)*100
labels2 <- paste(round(pct),"%",sep = "")
labels3<- paste (labels, labels2)
pie(dataset,labels = labels3, init.angle = 90, main = "Heart Disease's Proportion")
}
pie_chart6<- function(data){
dataset<- table(df$hypertension)
labels <- names(dataset)
pct <- prop.table(dataset)*100
labels2 <- paste(round(pct),"%",sep = "")
labels3<- paste (labels, labels2)
pie(dataset,labels = labels3, init.angle = 90, main = "Hypertension's Proportion")
}
par(mfrow=c(2,2))
pie_chart(df)
pie_chart2(df)
pie_chart3(df)
pie_chart4(df)
par(mfrow=c(2,1))
pie_chart5(df)
pie_chart6(df)
Now we can see the proportion of each category in the data set.
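The six helper functions above differ only in the column they tabulate and the plot title (and they read the global df rather than their data argument), so they could be collapsed into one parameterised function. A minimal sketch; pie_chart_of is a hypothetical name, not part of the original code:
#Generic pie-chart helper (sketch): tabulates one column of a data frame
pie_chart_of <- function(data, column, title){
  counts <- table(data[[column]])
  pct    <- round(prop.table(counts) * 100)
  labels <- paste(names(counts), paste0(pct, "%"))
  pie(counts, labels = labels, init.angle = 90, main = title)
}
par(mfrow = c(2, 2))
pie_chart_of(df, "stroke", "Stroke's Proportion")
pie_chart_of(df, "gender", "Gender's Proportion")
pie_chart_of(df, "work_type", "Work Type's Proportion")
pie_chart_of(df, "ever_married", "Ever Married's Proportion")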
#Logistic Regression
trainingData <- subset(df, select = c(2, 3, 4, 5, 6, 7,8,9,10, 11, 12))
head(trainingData)
## gender age hypertension heart_disease ever_married work_type
## 1 Male 67 0 1 Yes Private
## 2 Female 61 0 0 Yes Self-employed
## 3 Male 80 0 1 Yes Private
## 4 Female 49 0 0 Yes Private
## 5 Female 79 1 0 Yes Self-employed
## 6 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.00000 formerly smoked 1
## 2 Rural 202.21 27.32955 never smoked 1
## 3 Rural 105.92 32.00000 never smoked 1
## 4 Urban 171.23 34.00000 smokes 1
## 5 Rural 174.12 24.00000 never smoked 1
## 6 Urban 186.21 29.00000 formerly smoked 1
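The column selection above (columns 2 to 12) simply drops the id column; with dplyr already loaded, the same thing can be written by name (a one-line sketch, equivalent to the subset() call above):
#Equivalent column selection by name (sketch)
trainingData <- dplyr::select(df, -id)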
We drop id because it is just a unique identifier (a primary key), so it carries no predictive information.
#Creating training and validation data with an 80:20 split
sapply(trainingData, function(x) sum(is.na(x)))
## gender age hypertension heart_disease
## 0 0 0 0
## ever_married work_type Residence_type avg_glucose_level
## 0 0 0 0
## bmi smoking_status stroke
## 0 0 0
temp <- sample.split(trainingData, SplitRatio = 0.8)
trainingsample <- subset(trainingData, temp == TRUE)
validsample <- subset(trainingData, temp == FALSE)
dim(trainingsample)
## [1] 3717 11
dim(validsample)
## [1] 1393 11
dim(df)
## [1] 5110 12
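Note that sample.split() from caTools is normally given the outcome vector rather than the whole data frame; passing the data frame, as above, is why the split comes out at roughly 73:27 (3717 of 5110 rows) instead of 80:20. A sketch of the more usual call (the object names here are illustrative, and this is not what produced the dimensions above):
#Conventional caTools split on the outcome, so the stroke proportion is preserved (sketch)
split_flag      <- sample.split(trainingData$stroke, SplitRatio = 0.8)
trainingsample2 <- subset(trainingData, split_flag == TRUE)
validsample2    <- subset(trainingData, split_flag == FALSE)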
Before we continue, we split the data into training and validation sets (target ratio 80:20) and check the dimensions of each subset; here the actual split comes out at roughly 73:27.
#Modelling
model1 <- glm(stroke ~ . , family=binomial(link="logit"), data=trainingsample)
summary(model1)
##
## Call:
## glm(formula = stroke ~ ., family = binomial(link = "logit"),
## data = trainingsample)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0057 -0.3208 -0.1663 -0.0948 3.4758
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.497e+00 8.105e-01 -8.017 1.08e-15 ***
## genderMale -3.234e-02 1.667e-01 -0.194 0.84619
## genderOther -1.055e+01 1.455e+03 -0.007 0.99422
## age 7.421e-02 6.768e-03 10.965 < 2e-16 ***
## hypertension 3.534e-01 1.948e-01 1.814 0.06973 .
## heart_disease 3.758e-01 2.188e-01 1.718 0.08587 .
## ever_marriedYes -2.514e-01 2.621e-01 -0.960 0.33730
## work_typeGovt_job -1.188e+00 8.727e-01 -1.361 0.17359
## work_typeNever_worked -1.068e+01 3.746e+02 -0.029 0.97725
## work_typePrivate -1.078e+00 8.524e-01 -1.264 0.20617
## work_typeSelf-employed -1.365e+00 8.773e-01 -1.555 0.11987
## Residence_typeUrban 4.990e-03 1.619e-01 0.031 0.97541
## avg_glucose_level 4.072e-03 1.419e-03 2.869 0.00411 **
## bmi 2.131e-03 1.330e-02 0.160 0.87270
## smoking_statusnever smoked -9.566e-02 2.093e-01 -0.457 0.64758
## smoking_statussmokes 3.230e-01 2.470e-01 1.308 0.19101
## smoking_statusUnknown 1.150e-02 2.480e-01 0.046 0.96303
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1453.0 on 3716 degrees of freedom
## Residual deviance: 1160.2 on 3700 degrees of freedom
## AIC: 1194.2
##
## Number of Fisher Scoring iterations: 14
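The terms that are significant at the 5% level can also be listed programmatically from the fitted model (a minimal sketch; the same information is in the significance codes printed above):
#Coefficients of model1 with p-values below 0.05 (sketch)
coefs <- summary(model1)$coefficients
coefs[coefs[, "Pr(>|z|)"] < 0.05, , drop = FALSE]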
In this first model, age is by far the most significant predictor of stroke, with avg_glucose_level also significant; hypertension and heart_disease are only borderline. For the second model, let's use just the variables that showed a relationship with stroke in the correlation analysis above.
model2 <- glm(stroke ~ age + hypertension+ heart_disease+ ever_married +avg_glucose_level, family=binomial(link="logit"), data=trainingsample)
summary(model2)
##
## Call:
## glm(formula = stroke ~ age + hypertension + heart_disease + ever_married +
## avg_glucose_level, family = binomial(link = "logit"), data = trainingsample)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9857 -0.3213 -0.1753 -0.0930 3.6944
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.210080 0.421812 -17.093 < 2e-16 ***
## age 0.068335 0.005982 11.423 < 2e-16 ***
## hypertension 0.350413 0.192295 1.822 0.06841 .
## heart_disease 0.417002 0.215631 1.934 0.05313 .
## ever_marriedYes -0.269192 0.252836 -1.065 0.28702
## avg_glucose_level 0.004215 0.001371 3.074 0.00211 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1453.0 on 3716 degrees of freedom
## Residual deviance: 1167.1 on 3711 degrees of freedom
## AIC: 1179.1
##
## Number of Fisher Scoring iterations: 7
model3 <- glm(stroke ~age + hypertension, family=binomial(link="logit"), data=trainingsample)
summary(model3)
##
## Call:
## glm(formula = stroke ~ age + hypertension, family = binomial(link = "logit"),
## data = trainingsample)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8637 -0.3323 -0.1828 -0.0866 3.7390
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.083289 0.384829 -18.406 <2e-16 ***
## age 0.071433 0.005751 12.421 <2e-16 ***
## hypertension 0.431841 0.189101 2.284 0.0224 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1453.0 on 3716 degrees of freedom
## Residual deviance: 1182.5 on 3714 degrees of freedom
## AIC: 1188.5
##
## Number of Fisher Scoring iterations: 7
For the third model we keep only the most significant predictors, age and hypertension. Since that leaves just two variables, we also add bmi, which we will reuse for the decision tree model later.
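Before moving on, the three logistic fits can be compared by AIC, where lower is better (a minimal sketch; the AIC values also appear in the summaries above):
#AIC comparison of the three logistic regression models (sketch)
AIC(model1, model2, model3)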
#ROC Curve
model3 <- glm(stroke ~age + hypertension + bmi, family=binomial(link="logit"), data=trainingsample)
prediksi <- predict(model3, newdata=subset(validsample, select = c(2,3,9,11)),type = "response")
pd <- prediction(prediksi, validsample$stroke)
rocCurve <- performance(pd, measure="tpr", x.measure="fpr")
plot(rocCurve)
auc <- performance(pd, measure="auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8518437
Here we get an AUC of about 0.85, which is already good: the higher the AUC, the better the model separates stroke from non-stroke cases. Now let's look at the prediction accuracy.
#Accuracy
result <- ifelse(prediksi>0.5, 1, 0)
misclassificationError <- mean(result != validsample$stroke)
print(paste("Accuracy : ", 1-misclassificationError))
## [1] "Accuracy : 0.951902368987796"
We get an accuracy of about 0.95 (95%) on the validation set. Note, however, that only about 5% of patients in this data set had a stroke, so a model that always predicts "no stroke" would reach a similar accuracy; the AUC above is the more informative measure.
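Because stroke cases are so rare, it is worth looking at the confusion matrix and the majority-class baseline alongside the accuracy (a minimal sketch using the objects defined above; output not shown):
#Confusion matrix of the logistic model and the accuracy of always predicting "no stroke" (sketch)
table(Predicted = result, Actual = validsample$stroke)
mean(validsample$stroke == 0)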
library(rpart)
library(rpart.plot)
Now let's build a decision tree model using the same predictors as before.
#Decision Tree Modelling
DTmodel<-rpart(stroke~ age + hypertension + bmi, data=trainingsample, method="class", cp = 0.0000000000001)#cp = complexity parameter
DTmodel
## n= 3717
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3717 182 0 (0.95103578 0.04896422)
## 2) age< 67.5 3110 75 0 (0.97588424 0.02411576) *
## 3) age>=67.5 607 107 0 (0.82372323 0.17627677)
## 6) age< 75.5 283 36 0 (0.87279152 0.12720848) *
## 7) age>=75.5 324 71 0 (0.78086420 0.21913580)
## 14) bmi>=27.66477 156 28 0 (0.82051282 0.17948718)
## 28) age< 79.5 91 13 0 (0.85714286 0.14285714) *
## 29) age>=79.5 65 15 0 (0.76923077 0.23076923)
## 58) age>=80.5 41 7 0 (0.82926829 0.17073171) *
## 59) age< 80.5 24 8 0 (0.66666667 0.33333333)
## 118) hypertension< 0.5 17 4 0 (0.76470588 0.23529412) *
## 119) hypertension>=0.5 7 3 1 (0.42857143 0.57142857) *
## 15) bmi< 27.66477 168 43 0 (0.74404762 0.25595238) *
rpart.plot(DTmodel, extra = 106)
prediksiDT<-predict(DTmodel, validsample, type="class")
tempsample<-table(prediksiDT, validsample$stroke)
tempsample
##
## prediksiDT 0 1
## 0 1321 67
## 1 5 0
#Overall accuracy and misclassification rate
sum(diag(tempsample)) / sum(tempsample)
## [1] 0.948313
1- sum(diag(tempsample)) / sum(tempsample)
## [1] 0.05168701
Here the overall accuracy of the decision tree model is about 0.948 (94.8%), with a misclassification rate of about 5.2%. The confusion table shows, however, that all 67 stroke cases in the validation set were misclassified, so the high accuracy mainly reflects the class imbalance.
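The per-class performance can be read off the confusion table above; as a sketch, the sensitivity (recall on stroke cases) and specificity can be computed from it directly:
#Sensitivity and specificity of the decision tree from the confusion table (sketch)
sens <- tempsample["1", "1"] / sum(tempsample[, "1"])   # true positives / actual strokes
spec <- tempsample["0", "0"] / sum(tempsample[, "0"])   # true negatives / actual non-strokes
c(sensitivity = sens, specificity = spec)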
#Conclusion
Both the logistic regression and decision tree models reach high overall accuracy, so this data set is workable for modelling, although the strong class imbalance (about 5% stroke cases) should be kept in mind when judging predictions.