head(dt)
#a
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
#first omit N/A in data
#HINT: use the na.omit() function
stroke.data=na.omit(dt)
with(stroke.data,sum(is.na(stroke.data)))
## [1] 0
#convert categorical variables with numerical encoding
#E.g: stroke.data$gender <- as.numeric(factor(stroke.data$gender))
#hint: there should be 6 in total!
stroke.data$gender <- as.numeric(factor(stroke.data$gender))
stroke.data$ever_married <- as.numeric(factor(stroke.data$ever_married))
stroke.data$work_type <- as.numeric(factor(stroke.data$work_type))
stroke.data$Residence_type <- as.numeric(factor(stroke.data$Residence_type))
stroke.data$smoking_status <- as.numeric(factor(stroke.data$smoking_status))
stroke.data$bmi <- as.numeric(factor(stroke.data$bmi))
#head() allows you to preview the data
head(stroke.data)
#b
set.seed(1234)
training_ind = sample.int(nrow(stroke.data), nrow(stroke.data) * 0.7)
train.stroke = stroke.data[training_ind, ]
test.stroke = stroke.data[-training_ind, ]
#c
stroke.glm.1 = glm(stroke~., data = train.stroke, family = "binomial")
summary(stroke.glm.1)
## 
## Call:
## glm(formula = stroke ~ ., family = "binomial", data = train.stroke)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2959  -0.3324  -0.1756  -0.0831   3.4440  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -7.785e+00  7.824e-01  -9.950  < 2e-16 ***
## id                 6.156e-06  3.688e-06   1.669 0.095089 .  
## gender            -3.696e-02  1.620e-01  -0.228 0.819489    
## age                7.402e-02  6.254e-03  11.837  < 2e-16 ***
## hypertension       1.426e-01  1.928e-01   0.740 0.459374    
## heart_disease      1.444e-01  2.287e-01   0.631 0.527773    
## ever_married      -3.109e-01  2.467e-01  -1.260 0.207703    
## work_type         -5.852e-02  8.576e-02  -0.682 0.494976    
## Residence_type     7.476e-02  1.587e-01   0.471 0.637646    
## avg_glucose_level  4.131e-03  1.348e-03   3.064 0.002185 ** 
## bmi                3.406e-03  8.837e-04   3.855 0.000116 ***
## smoking_status    -4.120e-03  7.630e-02  -0.054 0.956939    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.6  on 3576  degrees of freedom
## Residual deviance: 1175.2  on 3565  degrees of freedom
## AIC: 1199.2
## 
## Number of Fisher Scoring iterations: 7
#4 most significant variables are age, bmi, avg_glucose_level, id. However, because id has no reality mearning, we only use first three variables in the next model.
#d
stroke.glm.2 = glm(stroke~age+bmi+avg_glucose_level, data = train.stroke, family = "binomial")
summary(stroke.glm.2)
## 
## Call:
## glm(formula = stroke ~ age + bmi + avg_glucose_level, family = "binomial", 
##     data = train.stroke)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2235  -0.3352  -0.1773  -0.0772   3.5985  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -8.2160633  0.4556835 -18.030  < 2e-16 ***
## age                0.0736033  0.0059148  12.444  < 2e-16 ***
## bmi                0.0032188  0.0008634   3.728 0.000193 ***
## avg_glucose_level  0.0042232  0.0013177   3.205 0.001351 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.6  on 3576  degrees of freedom
## Residual deviance: 1181.4  on 3573  degrees of freedom
## AIC: 1189.4
## 
## Number of Fisher Scoring iterations: 7
#e
stroke.cor_mat = cor(stroke.data)[, "stroke"]
stroke.cor_mat[stroke.cor_mat > 0.12]
##               age      hypertension     heart_disease avg_glucose_level 
##         0.2452573         0.1279038         0.1349140         0.1319454 
##            stroke 
##         1.0000000
corrplot(cor(stroke.data),method = "color", tl.col = "black")

stroke.glm.3 = glm(stroke~age+hypertension+heart_disease+avg_glucose_level, data = train.stroke, family = "binomial")
summary(stroke.glm.3)
## 
## Call:
## glm(formula = stroke ~ age + hypertension + heart_disease + avg_glucose_level, 
##     family = "binomial", data = train.stroke)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0341  -0.3343  -0.1824  -0.0867   3.7536  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -7.484405   0.408436 -18.325  < 2e-16 ***
## age                0.069782   0.005890  11.847  < 2e-16 ***
## hypertension       0.222470   0.189829   1.172 0.241220    
## heart_disease      0.170702   0.225282   0.758 0.448614    
## avg_glucose_level  0.004950   0.001315   3.765 0.000167 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.6  on 3576  degrees of freedom
## Residual deviance: 1192.7  on 3572  degrees of freedom
## AIC: 1202.7
## 
## Number of Fisher Scoring iterations: 7
#f
sum1 = summary(stroke.glm.1)
train_pred = predict(stroke.glm.1, data = train.stroke, type = "response")
test_pred = predict(stroke.glm.1, data = test.stroke, type = "response")

train.mse = mean((train_pred - train.stroke$stroke)^2)
test.mse = mean((test_pred - test.stroke$stroke)^2)
## Warning in test_pred - test.stroke$stroke: longer object length is not a
## multiple of shorter object length
RSS = sum((train_pred - train.stroke$stroke)^2)
starting_variance = sum((train.stroke$stroke - mean(train.stroke$stroke))^2)
train.r2 = 1 - RSS/starting_variance

train.mse
## [1] 0.04517531
test.mse
## [1] 0.05236293
train.r2
## [1] 0.1106369
#g
friend.bmi = stroke.data$bmi[which(dt$bmi == "17.7")[1]]
friend.smoking_status =
stroke.data$smoking_status[which(dt$smoking_status == "formerly smoked")[1]]
friend.data = data.frame(id = 00713, gender = 1, age = 20, hypertension = 0, heart_disease = 0, ever_married = 0, work_type = 1, Residence_type = 2, avg_glucose_level = 63.71, bmi = friend.bmi, smoking_status = friend.smoking_status)
friend.data
predict(stroke.glm.1, newdata = friend.data, type = "response") 
##           1 
## 0.002979697
ifelse(predict(stroke.glm.1, newdata = friend.data, type = "response") > 0.5, 
       "Have Stroke", "Will not Have Stroke")
##                      1 
## "Will not Have Stroke"