Part 3 - stats 239 Alex Matteson
library(tidyverse)
#install.packages("ISLR")
library(ISLR)
#install.packages("tree")
library(tree)
library(readr)
COVID19_Data <- read_csv("Downloads/novel-corona-virus-2019-dataset/COVID19_line_list_data.csv")
View(COVID19_line_list_data)
head(COVID19_Data)
Make all dates in recovered and death = 1, so that we have a dummy variable for death or recovered. Also make variable death to death a numeric
Here are my logistic regression models. One is simpler and one is more complex.
#models
m1 <- glm(death ~ age, data= COVID19_Data, family = "binomial")
summary(m1)
Call:
glm(formula = death ~ age, family = "binomial", data = COVID19_Data)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.3158 -0.3893 -0.2260 -0.1301 2.9871
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.31140 0.71990 -10.16 < 2e-16 ***
age 0.07949 0.01058 7.51 5.91e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 422.39 on 842 degrees of freedom
Residual deviance: 344.53 on 841 degrees of freedom
(242 observations deleted due to missingness)
AIC: 348.53
Number of Fisher Scoring iterations: 6
m2 <- glm(death ~ age + gender + `from Wuhan`, data= COVID19_Data, family = "binomial")
summary(m2)
Call:
glm(formula = death ~ age + gender + `from Wuhan`, family = "binomial",
data = COVID19_Data)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.3302 -0.3001 -0.1678 -0.0807 3.1749
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -9.24125 0.92577 -9.982 < 2e-16 ***
age 0.08642 0.01214 7.119 1.09e-12 ***
gendermale 1.09658 0.35593 3.081 0.00206 **
`from Wuhan` 2.27477 0.32834 6.928 4.27e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 419.21 on 820 degrees of freedom
Residual deviance: 281.95 on 817 degrees of freedom
(264 observations deleted due to missingness)
AIC: 289.95
Number of Fisher Scoring iterations: 7
Confusion matrixs:
#Confusion Matrixs
#m1
pred1 <- predict(m1, newdata = COVID19_Data, type = "response")
conf_mat1<-data.frame(death=COVID19_Data$death, preddeath=pred1>.5)%>%
group_by(death, preddeath)%>%
summarise(n=n())
conf_mat1
#m2
pred2<-predict(m2, newdata = COVID19_Data, type = "response")
conf_mat2<-data.frame(death=COVID19_Data$death, preddeath=pred2>.5)%>%
group_by(death, preddeath)%>%
summarise(n=n())
conf_mat2
Testing and Training: Divide the data into testing and training sets.
#divide data
set.seed(500)
train_indices <- sample(1:nrow(COVID19_Data), size = floor(nrow(COVID19_Data)/2))
train_data <- COVID19_Data %>%
slice(train_indices)
test_data <- COVID19_Data %>%
slice(-train_indices)
Training:
#train
m1 <- glm(death ~ age, data= train_data, family = "binomial")
m2 <- glm(death ~ age + gender + `from Wuhan`, data= train_data, family = "binomial")
Testing and confusion matrixs:
#1
test1 <- predict(m1, newdata = test_data, type = "response")
test_mat1<-data.frame(death=test_data$death, preddeath=test1>.5)%>%
group_by(death, preddeath)%>%
summarise(n=n())
test_mat1
#2
test2<-predict(m2, newdata = test_data, type = "response")
test_mat2<-data.frame(death=test_data$death, preddeath=test2>.5)%>%
group_by(death, preddeath)%>%
summarise(n=n())
test_mat2
NA