library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.4
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyverse)
library(dplyr)
library(tidyr)
For this project, I will be using the COVID19_line_list_data.csv.
covid<-read.csv("COVID19_line_list_data.csv", header = TRUE)
covid<-na.omit(covid)
str(covid)
## 'data.frame': 821 obs. of 11 variables:
## $ ï..case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ reporting.date: Factor w/ 43 levels "2020-01-13","2020-01-15",..: 4 4 5 5 5 5 5 5 5 5 ...
## $ location : Factor w/ 156 levels "Afghanistan",..: 117 114 155 130 130 25 118 14 14 14 ...
## $ country : Factor w/ 38 levels "Afghanistan",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 1 2 2 2 2 ...
## $ age : num 66 56 46 60 58 44 34 37 39 56 ...
## $ visiting.Wuhan: int 1 0 0 1 0 0 0 1 1 1 ...
## $ from.Wuhan : int 0 1 1 0 0 1 1 0 0 0 ...
## $ death : int 0 0 0 0 0 0 0 0 0 0 ...
## $ recovered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ symptom : Factor w/ 111 levels "","chest discomfort",..: 94 94 1 94 94 94 94 94 94 94 ...
## - attr(*, "na.action")= 'omit' Named int 23 92 117 146 147 148 170 171 172 173 ...
## ..- attr(*, "names")= chr "23" "92" "117" "146" ...
I need to turn these variables into useful ones for this project.
covid$age<-as.integer(covid$age)
covid$ï..case<-as.integer(covid$ï..case)
covid$visiting.Wuhan<-as.integer(covid$visiting.Wuhan)
Now I’m fitting a model of survival based on recovery of the subject if the subject visited Wuhan, China.
dom<-glm(recovered~log(ï..case), data = covid, family = "binomial")
summary(dom)
##
## Call:
## glm(formula = recovered ~ log(ï..case), family = "binomial",
## data = covid)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8330 -0.7147 -0.5649 -0.3352 2.0945
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.7472 0.8633 -6.657 2.79e-11 ***
## log(ï..case) 0.6964 0.1390 5.010 5.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 759.34 on 820 degrees of freedom
## Residual deviance: 725.31 on 819 degrees of freedom
## AIC: 729.31
##
## Number of Fisher Scoring iterations: 5
dom2<-glm(recovered~log(ï..case):visiting.Wuhan+gender, data = covid, family = "binomial")
summary(dom2)
##
## Call:
## glm(formula = recovered ~ log(ï..case):visiting.Wuhan + gender,
## family = "binomial", data = covid)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7977 -0.6155 -0.5788 -0.5788 1.9335
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.56744 0.14551 -10.772 <2e-16 ***
## gendermale -0.13432 0.18593 -0.722 0.4700
## log(ï..case):visiting.Wuhan 0.08439 0.04087 2.065 0.0389 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 759.34 on 820 degrees of freedom
## Residual deviance: 754.87 on 818 degrees of freedom
## AIC: 760.87
##
## Number of Fisher Scoring iterations: 4
Now, I’m going to create a confusion matrix of this.
pred1<-predict(dom, newdata = covid, type = "response")
conf_mat<-data.frame(recovered=covid$recovered, predRec=pred1>=1)%>%
group_by(recovered, predRec)%>%
summarise(n=n())
conf_mat
## # A tibble: 2 x 3
## # Groups: recovered [2]
## recovered predRec n
## <int> <lgl> <int>
## 1 0 FALSE 678
## 2 1 FALSE 143
pred2<-predict(dom2, newdata = covid, type = "response")
conf_mat2<-data.frame(recovered=covid$recovered, predRec=pred2>=1)%>%
group_by(recovered, predRec)%>%
summarise(n=n())
conf_mat2
## # A tibble: 2 x 3
## # Groups: recovered [2]
## recovered predRec n
## <int> <lgl> <int>
## 1 0 FALSE 678
## 2 1 FALSE 143
Looking at this, I can see that this tells me there is a higher chance of someone not recovering from COVID-19 if did not visit Wuhan. But I need to test and train the model too.
set.seed(34)
train.indices<-sample(1:nrow(covid), size = floor(nrow(covid)/2))
train.data<-covid%>%
slice(train.indices)
test.data<-covid%>%
slice(-train.indices)
#training
m1<-glm(recovered~log(ï..case), data = train.data, family = "binomial")
m2<-glm(recovered~log(ï..case):visiting.Wuhan+gender, data = train.data, family = "binomial")
#testing
test1<-predict(m1, newdata = test.data, type = "response")
test.mat1<-data.frame(recovered=test.data$recovered, predRec=test1>=1)%>%
group_by(recovered, predRec)%>%
summarise(n=n())
test.mat1
## # A tibble: 2 x 3
## # Groups: recovered [2]
## recovered predRec n
## <int> <lgl> <int>
## 1 0 FALSE 345
## 2 1 FALSE 66
test2<-predict(m2, newdata = test.data, type = "response")
test.mat2<-data.frame(recovered=test.data$recovered, predRec=test2>=1)%>%
group_by(recovered, predRec)%>%
summarise(n=n())
test.mat2
## # A tibble: 2 x 3
## # Groups: recovered [2]
## recovered predRec n
## <int> <lgl> <int>
## 1 0 FALSE 345
## 2 1 FALSE 66
By looking at these results, I can see that this model would predict that, again, most people would not recover from the coronavirus if they did not visit Wuhan. Based on the data provided, I would choose the first model because of the lack of data in some of the variables.