head(dt)
#a
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
#first omit N/A in data
#HINT: use the na.omit() function
stroke.data=na.omit(dt)
with(stroke.data,sum(is.na(stroke.data)))
## [1] 0
#convert categorical variables with numerical encoding
#E.g: stroke.data$gender <- as.numeric(factor(stroke.data$gender))
#hint: there should be 6 in total!
stroke.data$gender <- as.numeric(factor(stroke.data$gender))
stroke.data$ever_married <- as.numeric(factor(stroke.data$ever_married))
stroke.data$work_type <- as.numeric(factor(stroke.data$work_type))
stroke.data$Residence_type <- as.numeric(factor(stroke.data$Residence_type))
stroke.data$smoking_status <- as.numeric(factor(stroke.data$smoking_status))
stroke.data$bmi <- as.numeric(factor(stroke.data$bmi))
#head() allows you to preview the data
head(stroke.data)
#b
set.seed(1234)
training_ind = sample.int(nrow(stroke.data), nrow(stroke.data) * 0.7)
train.stroke = stroke.data[training_ind, ]
test.stroke = stroke.data[-training_ind, ]
#c
stroke.glm.1 = glm(stroke~., data = train.stroke, family = "binomial")
summary(stroke.glm.1)
##
## Call:
## glm(formula = stroke ~ ., family = "binomial", data = train.stroke)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2959 -0.3324 -0.1756 -0.0831 3.4440
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.785e+00 7.824e-01 -9.950 < 2e-16 ***
## id 6.156e-06 3.688e-06 1.669 0.095089 .
## gender -3.696e-02 1.620e-01 -0.228 0.819489
## age 7.402e-02 6.254e-03 11.837 < 2e-16 ***
## hypertension 1.426e-01 1.928e-01 0.740 0.459374
## heart_disease 1.444e-01 2.287e-01 0.631 0.527773
## ever_married -3.109e-01 2.467e-01 -1.260 0.207703
## work_type -5.852e-02 8.576e-02 -0.682 0.494976
## Residence_type 7.476e-02 1.587e-01 0.471 0.637646
## avg_glucose_level 4.131e-03 1.348e-03 3.064 0.002185 **
## bmi 3.406e-03 8.837e-04 3.855 0.000116 ***
## smoking_status -4.120e-03 7.630e-02 -0.054 0.956939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.6 on 3576 degrees of freedom
## Residual deviance: 1175.2 on 3565 degrees of freedom
## AIC: 1199.2
##
## Number of Fisher Scoring iterations: 7
#4 most significant variables are age, bmi, avg_glucose_level, id. However, because id has no reality mearning, we only use first three variables in the next model.
#d
stroke.glm.2 = glm(stroke~age+bmi+avg_glucose_level, data = train.stroke, family = "binomial")
summary(stroke.glm.2)
##
## Call:
## glm(formula = stroke ~ age + bmi + avg_glucose_level, family = "binomial",
## data = train.stroke)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2235 -0.3352 -0.1773 -0.0772 3.5985
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.2160633 0.4556835 -18.030 < 2e-16 ***
## age 0.0736033 0.0059148 12.444 < 2e-16 ***
## bmi 0.0032188 0.0008634 3.728 0.000193 ***
## avg_glucose_level 0.0042232 0.0013177 3.205 0.001351 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.6 on 3576 degrees of freedom
## Residual deviance: 1181.4 on 3573 degrees of freedom
## AIC: 1189.4
##
## Number of Fisher Scoring iterations: 7
#e
stroke.cor_mat = cor(stroke.data)[, "stroke"]
stroke.cor_mat[stroke.cor_mat > 0.12]
## age hypertension heart_disease avg_glucose_level
## 0.2452573 0.1279038 0.1349140 0.1319454
## stroke
## 1.0000000
corrplot(cor(stroke.data),method = "color", tl.col = "black")

stroke.glm.3 = glm(stroke~age+hypertension+heart_disease+avg_glucose_level, data = train.stroke, family = "binomial")
summary(stroke.glm.3)
##
## Call:
## glm(formula = stroke ~ age + hypertension + heart_disease + avg_glucose_level,
## family = "binomial", data = train.stroke)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0341 -0.3343 -0.1824 -0.0867 3.7536
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.484405 0.408436 -18.325 < 2e-16 ***
## age 0.069782 0.005890 11.847 < 2e-16 ***
## hypertension 0.222470 0.189829 1.172 0.241220
## heart_disease 0.170702 0.225282 0.758 0.448614
## avg_glucose_level 0.004950 0.001315 3.765 0.000167 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.6 on 3576 degrees of freedom
## Residual deviance: 1192.7 on 3572 degrees of freedom
## AIC: 1202.7
##
## Number of Fisher Scoring iterations: 7
#f
sum1 = summary(stroke.glm.1)
train_pred = predict(stroke.glm.1, data = train.stroke, type = "response")
test_pred = predict(stroke.glm.1, data = test.stroke, type = "response")
train.mse = mean((train_pred - train.stroke$stroke)^2)
test.mse = mean((test_pred - test.stroke$stroke)^2)
## Warning in test_pred - test.stroke$stroke: longer object length is not a
## multiple of shorter object length
RSS = sum((train_pred - train.stroke$stroke)^2)
starting_variance = sum((train.stroke$stroke - mean(train.stroke$stroke))^2)
train.r2 = 1 - RSS/starting_variance
train.mse
## [1] 0.04517531
test.mse
## [1] 0.05236293
train.r2
## [1] 0.1106369
#g
friend.bmi = stroke.data$bmi[which(dt$bmi == "17.7")[1]]
friend.smoking_status =
stroke.data$smoking_status[which(dt$smoking_status == "formerly smoked")[1]]
friend.data = data.frame(id = 00713, gender = 1, age = 20, hypertension = 0, heart_disease = 0, ever_married = 0, work_type = 1, Residence_type = 2, avg_glucose_level = 63.71, bmi = friend.bmi, smoking_status = friend.smoking_status)
friend.data
predict(stroke.glm.1, newdata = friend.data, type = "response")
## 1
## 0.002979697
ifelse(predict(stroke.glm.1, newdata = friend.data, type = "response") > 0.5,
"Have Stroke", "Will not Have Stroke")
## 1
## "Will not Have Stroke"