library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.4
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyverse)
library(dplyr)
library(tidyr)

Loading in COVID-19

For this project, I will be using the COVID19_line_list_data.csv.

covid<-read.csv("COVID19_line_list_data.csv", header = TRUE)
covid<-na.omit(covid)
str(covid)
## 'data.frame':    821 obs. of  11 variables:
##  $ ï..case       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ reporting.date: Factor w/ 43 levels "2020-01-13","2020-01-15",..: 4 4 5 5 5 5 5 5 5 5 ...
##  $ location      : Factor w/ 156 levels "Afghanistan",..: 117 114 155 130 130 25 118 14 14 14 ...
##  $ country       : Factor w/ 38 levels "Afghanistan",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ gender        : Factor w/ 2 levels "female","male": 2 1 2 1 2 1 2 2 2 2 ...
##  $ age           : num  66 56 46 60 58 44 34 37 39 56 ...
##  $ visiting.Wuhan: int  1 0 0 1 0 0 0 1 1 1 ...
##  $ from.Wuhan    : int  0 1 1 0 0 1 1 0 0 0 ...
##  $ death         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ recovered     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ symptom       : Factor w/ 111 levels "","chest discomfort",..: 94 94 1 94 94 94 94 94 94 94 ...
##  - attr(*, "na.action")= 'omit' Named int  23 92 117 146 147 148 170 171 172 173 ...
##   ..- attr(*, "names")= chr  "23" "92" "117" "146" ...

I need to turn these variables into useful ones for this project.

covid$age<-as.integer(covid$age)
covid$ï..case<-as.integer(covid$ï..case)
covid$visiting.Wuhan<-as.integer(covid$visiting.Wuhan)

Now I’m fitting a model of survival based on recovery of the subject if the subject visited Wuhan, China.

dom<-glm(recovered~log(ï..case), data = covid, family = "binomial")
summary(dom)
## 
## Call:
## glm(formula = recovered ~ log(ï..case), family = "binomial", 
##     data = covid)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8330  -0.7147  -0.5649  -0.3352   2.0945  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.7472     0.8633  -6.657 2.79e-11 ***
## log(ï..case)   0.6964     0.1390   5.010 5.45e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 759.34  on 820  degrees of freedom
## Residual deviance: 725.31  on 819  degrees of freedom
## AIC: 729.31
## 
## Number of Fisher Scoring iterations: 5
dom2<-glm(recovered~log(ï..case):visiting.Wuhan+gender, data = covid, family = "binomial")
summary(dom2)
## 
## Call:
## glm(formula = recovered ~ log(ï..case):visiting.Wuhan + gender, 
##     family = "binomial", data = covid)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7977  -0.6155  -0.5788  -0.5788   1.9335  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.56744    0.14551 -10.772   <2e-16 ***
## gendermale                  -0.13432    0.18593  -0.722   0.4700    
## log(ï..case):visiting.Wuhan  0.08439    0.04087   2.065   0.0389 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 759.34  on 820  degrees of freedom
## Residual deviance: 754.87  on 818  degrees of freedom
## AIC: 760.87
## 
## Number of Fisher Scoring iterations: 4

Now, I’m going to create a confusion matrix of this.

pred1<-predict(dom, newdata = covid, type = "response")
conf_mat<-data.frame(recovered=covid$recovered, predRec=pred1>=1)%>%
  group_by(recovered, predRec)%>%
  summarise(n=n())

conf_mat
## # A tibble: 2 x 3
## # Groups:   recovered [2]
##   recovered predRec     n
##       <int> <lgl>   <int>
## 1         0 FALSE     678
## 2         1 FALSE     143
pred2<-predict(dom2, newdata = covid, type = "response")
conf_mat2<-data.frame(recovered=covid$recovered, predRec=pred2>=1)%>%
  group_by(recovered, predRec)%>%
  summarise(n=n())
conf_mat2
## # A tibble: 2 x 3
## # Groups:   recovered [2]
##   recovered predRec     n
##       <int> <lgl>   <int>
## 1         0 FALSE     678
## 2         1 FALSE     143

Looking at this, I can see that this tells me there is a higher chance of someone not recovering from COVID-19 if did not visit Wuhan. But I need to test and train the model too.

set.seed(34)
train.indices<-sample(1:nrow(covid), size = floor(nrow(covid)/2))
train.data<-covid%>%
  slice(train.indices)
test.data<-covid%>%
  slice(-train.indices)

#training
m1<-glm(recovered~log(ï..case), data = train.data, family = "binomial")
m2<-glm(recovered~log(ï..case):visiting.Wuhan+gender, data = train.data, family = "binomial")

#testing
test1<-predict(m1, newdata = test.data, type = "response")
test.mat1<-data.frame(recovered=test.data$recovered, predRec=test1>=1)%>%
  group_by(recovered, predRec)%>%
  summarise(n=n())
test.mat1
## # A tibble: 2 x 3
## # Groups:   recovered [2]
##   recovered predRec     n
##       <int> <lgl>   <int>
## 1         0 FALSE     345
## 2         1 FALSE      66
test2<-predict(m2, newdata = test.data, type = "response")
test.mat2<-data.frame(recovered=test.data$recovered, predRec=test2>=1)%>%
  group_by(recovered, predRec)%>%
  summarise(n=n())
test.mat2
## # A tibble: 2 x 3
## # Groups:   recovered [2]
##   recovered predRec     n
##       <int> <lgl>   <int>
## 1         0 FALSE     345
## 2         1 FALSE      66

By looking at these results, I can see that this model would predict that, again, most people would not recover from the coronavirus if they did not visit Wuhan. Based on the data provided, I would choose the first model because of the lack of data in some of the variables.