# NOTE: the workspace is intentionally NOT cleared here. `rm(list = ls())`
# only removes user objects (attached packages, options and the working
# directory are left as they were), so it does not give a clean session, and
# it silently deletes the caller's objects when this file is sourced.
# Run the script in a fresh R session instead.
library("Hmisc")
## Le chargement a nécessité le package : lattice
## Le chargement a nécessité le package : survival
## Le chargement a nécessité le package : Formula
## Le chargement a nécessité le package : ggplot2
##
## Attachement du package : 'Hmisc'
## Les objets suivants sont masqués depuis 'package:base':
##
## format.pval, units
# Read the COVID-19 line list CSV into a data frame.
dat <- read.csv(
  file = "C:/Users/LE Thi Tuong/Desktop/CV and cover letter template/Real Data Engineering/R project/A test session with R_01.07.2022/COVID_R/COVID19_line_list_data.csv"
)
# Preview the first rows to check how the columns were parsed.
head(x = dat)
## id case_in_country reporting.date X
## 1 1 NA 1/20/2020 NA
## 2 2 NA 1/20/2020 NA
## 3 3 NA 1/21/2020 NA
## 4 4 NA 1/21/2020 NA
## 5 5 NA 1/21/2020 NA
## 6 6 NA 1/21/2020 NA
## summary
## 1 First confirmed imported COVID-19 pneumonia patient in Shenzhen (from Wuhan): male, 66, shenzheng residence, visited relatives in Wuhan on 12/29/2019, symptoms onset on 01/03/2020, returned to Shenzhen and seek medical care on 01/04/2020, hospitalized on 01/11/2020, sample sent to China CDC for testing on 01/18/2020, confirmed on 01/19/2020. 8 others under medical observation, contact tracing ongoing.
## 2 First confirmed imported COVID-19 pneumonia patient in Shanghai (from Wuhan): female, 56, Wuhan residence, arrived in Shanghai from Wuhan on 01/12/2020, symptom onset and visited fever clinic on 01/15/2020, laboratory confirmed on 01/20/2020
## 3 First confirmed imported cases in Zhejiang: patient is male, 46, lives in Wuhan, self-driving from Wuhan to Hangzhou on 01/03/2020, symptom onset 01/04/2020, hospitalized on 01/17/2020, sample deliver to China CDC for testing on 01/20/2020, test positive on 01/21/2020.
## 4 new confirmed imported COVID-19 pneumonia in Tianjin: female, age 60, recently visited Wuhan, visited fever clinic on 01/19/2020 in Tianjin then quarantined immediately.
## 5 new confirmed imported COVID-19 pneumonia in Tianjin: male, age 58, visited fever clinic on 01/14/2020.
## 6 First confirmed imported COVID-19 pneumonia patient in Chongqing (from Wuhan): female, age 44, symptoms onset on 01/15/2020, laboratory confirmed on 01/21/2020.
## location country gender age symptom_onset If_onset_approximated
## 1 Shenzhen, Guangdong China male 66 01/03/20 0
## 2 Shanghai China female 56 1/15/2020 0
## 3 Zhejiang China male 46 01/04/20 0
## 4 Tianjin China female 60 <NA> NA
## 5 Tianjin China male 58 <NA> NA
## 6 Chongqing China female 44 1/15/2020 0
## hosp_visit_date exposure_start exposure_end visiting.Wuhan from.Wuhan death
## 1 01/11/20 12/29/2019 01/04/20 1 0 0
## 2 1/15/2020 <NA> 01/12/20 0 1 0
## 3 1/17/2020 <NA> 01/03/20 0 1 0
## 4 1/19/2020 <NA> <NA> 1 0 0
## 5 1/14/2020 <NA> <NA> 0 0 0
## 6 <NA> <NA> <NA> 0 1 0
## recovered symptom source
## 1 0 Shenzhen Municipal Health Commission
## 2 0 Official Weibo of Shanghai Municipal Health Commission
## 3 0 Health Commission of Zhejiang Province
## 4 0 人民日报官方微博
## 5 0 人民日报官方微博
## 6 0 Chongqing Municipal Health Commission
## link
## 1 http://wjw.sz.gov.cn/wzx/202001/t20200120_18987787.htm
## 2 https://www.weibo.com/2372649470/IqogQhgfa?from=page_1001062372649470_profile&wvr=6&mod=weibotime&type=comment
## 3 http://www.zjwjw.gov.cn/art/2020/1/21/art_1202101_41786033.html
## 4 https://m.weibo.cn/status/4463235401268457?
## 5 https://m.weibo.cn/status/4463235401268457?
## 6 http://wsjkw.cq.gov.cn/tzgg/20200121/249730.html
## X.1 X.2 X.3 X.4 X.5 X.6
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA NA
## 3 NA NA NA NA NA NA
## 4 NA NA NA NA NA NA
## 5 NA NA NA NA NA NA
## 6 NA NA NA NA NA NA
# Keep only the three columns used by the rest of the analysis.
dat1 <- dat[c("gender", "age", "death")]
# Preview the reduced data frame.
head(x = dat1)
## gender age death
## 1 male 66 0
## 2 female 56 0
## 3 male 46 0
## 4 female 60 0
## 5 male 58 0
## 6 female 44 0
# Per-variable summary (counts, missing values, distinct values, quantiles).
# Called through the Hmisc namespace because `explore`, loaded further down,
# also exports a function named `describe`.
Hmisc::describe(dat1)
## dat1
##
## 3 Variables 1085 Observations
## --------------------------------------------------------------------------------
## gender
## n missing distinct
## 902 183 2
##
## Value female male
## Frequency 382 520
## Proportion 0.424 0.576
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 843 242 85 0.999 49.48 20.79 22.0 25.0
## .25 .50 .75 .90 .95
## 35.0 51.0 64.0 75.0 78.9
##
## lowest : 0.25 0.50 1.00 2.00 4.00, highest: 86.00 87.00 89.00 91.00 96.00
## --------------------------------------------------------------------------------
## death
## n missing distinct
## 1085 0 14
##
## lowest : 0 02/01/20 1 2/13/2020 2/14/2020
## highest: 2/24/2020 2/25/2020 2/26/2020 2/27/2020 2/28/2020
##
## 0 (1022, 0.942), 02/01/20 (1, 0.001), 1 (42, 0.039), 2/13/2020 (1, 0.001),
## 2/14/2020 (1, 0.001), 2/19/2020 (2, 0.002), 2/21/2020 (2, 0.002), 2/22/2020 (1,
## 0.001), 2/23/2020 (4, 0.004), 2/24/2020 (1, 0.001), 2/25/2020 (2, 0.002),
## 2/26/2020 (3, 0.003), 2/27/2020 (2, 0.002), 2/28/2020 (1, 0.001)
## --------------------------------------------------------------------------------
# Recode the outcome as a 0/1 flag. The raw `death` column mixes "0", "1" and
# dates of death, so every value other than "0" is counted as a death.
dat1[["death1"]] <- as.integer(dat1[["death"]] != 0)
# Sanity check: only the two expected codes should remain.
unique(dat1[["death1"]])
## [1] 0 1
# Overall death rate: number of deaths divided by the number of cases.
with(dat1, sum(death1) / length(death1))
## [1] 0.05806452
# Split the cases by outcome (rows where the condition is NA are excluded).
dead <- dat1[which(dat1$death1 == 1), ]
alive <- dat1[which(dat1$death1 == 0), ]
# Mean age of the deceased, ignoring missing ages.
with(dead, mean(age, na.rm = TRUE))
## [1] 68.58621
# Mean age of the survivors, ignoring missing ages.
with(alive, mean(age, na.rm = TRUE))
## [1] 48.07229
# Welch two-sample t-test: is the mean age different between the two groups?
t.test(
  x = alive$age,
  y = dead$age,
  conf.level = 0.95,
  alternative = "two.sided"
)
##
## Welch Two Sample t-test
##
## data: alive$age and dead$age
## t = -10.839, df = 72.234, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -24.28669 -16.74114
## sample estimates:
## mean of x mean of y
## 48.07229 68.58621
# Split the cases by gender (rows with a missing gender fall in neither group).
men <- dat1[which(dat1$gender == "male"), ]
women <- dat1[which(dat1$gender == "female"), ]
# Death rate among men: the mean of the 0/1 flag is the proportion of deaths.
with(men, mean(death1, na.rm = TRUE))
## [1] 0.08461538
# Death rate among women.
with(women, mean(death1, na.rm = TRUE))
## [1] 0.03664921
# Welch two-sample t-test on the 0/1 death flag, men versus women.
# NOTE(review): death1 is binary; a two-sample test of proportions
# (e.g. prop.test) may be a better fit than a t-test -- confirm with the author.
t.test(
  x = men$death1,
  y = women$death1,
  conf.level = 0.95,
  alternative = "two.sided"
)
##
## Welch Two Sample t-test
##
## data: men$death1 and women$death1
## t = 3.084, df = 894.06, p-value = 0.002105
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.01744083 0.07849151
## sample estimates:
## mean of x mean of y
## 0.08461538 0.03664921
library("table1")
##
## Attachement du package : 'table1'
## Les objets suivants sont masqués depuis 'package:Hmisc':
##
## label, label<-, units
## Les objets suivants sont masqués depuis 'package:base':
##
## units, units<-
# Descriptive summary table of gender, age and the death flag for all cases.
table1::table1(~ gender + age + death1, data = dat1)
| | Overall (N=1085) |
|---|---|
| gender | |
| female | 382 (35.2%) |
| male | 520 (47.9%) |
| Missing | 183 (16.9%) |
| age | |
| Mean (SD) | 49.5 (18.3) |
| Median [Min, Max] | 51.0 [0.250, 96.0] |
| Missing | 242 (22.3%) |
| death1 | |
| Mean (SD) | 0.0581 (0.234) |
| Median [Min, Max] | 0 [0, 1.00] |
# Same summary stratified by gender; death1 is wrapped in factor() so it is
# tabulated as counts per category instead of being summarised as a number.
table1::table1(~ factor(death1) + age | gender, data = dat1)
| | female (N=382) | male (N=520) | Overall (N=1085) |
|---|---|---|---|
| factor(death1) | | | |
| 0 | 368 (96.3%) | 476 (91.5%) | 1022 (94.2%) |
| 1 | 14 (3.7%) | 44 (8.5%) | 63 (5.8%) |
| age | | | |
| Mean (SD) | 49.6 (18.2) | 49.8 (17.9) | 49.5 (18.3) |
| Median [Min, Max] | 52.0 [2.00, 96.0] | 50.5 [0.500, 89.0] | 51.0 [0.250, 96.0] |
| Missing | 33 (8.6%) | 44 (8.5%) | 242 (22.3%) |
library("explore")
##
## Attachement du package : 'explore'
## L'objet suivant est masqué depuis 'package:Hmisc':
##
## describe
# Automatic exploratory plots of every variable in dat1.
explore::explore_all(data = dat1)
# Same plots, with each variable shown against gender as the target.
explore::explore_all(data = dat1, target = gender)
# Same plots, with each variable shown against age as the target.
explore::explore_all(data = dat1, target = age)
### Graphical analysis with ggplot2
library("ggplot2")
library(gridExtra)
# Age distribution of the cases, drawn three ways and arranged side by side.

# p1: histogram split by outcome. death1 is an integer 0/1 flag; mapped as-is
# to `col` it is treated as a continuous variable and does not split the bars
# into groups, so it is converted to a factor and mapped to `fill` instead.
p1 <- ggplot(data = dat1, aes(x = age, fill = factor(death1))) +
  geom_histogram() +
  labs(fill = "death1")

# p2: plain histogram with fixed colours. The former `col = death1` mapping
# was overridden by the constant `col = "white"` and has been removed.
p2 <- ggplot(data = dat1, aes(x = age)) +
  geom_histogram(col = "white", fill = "blue")

# p3: histogram on the density scale with a kernel density curve on top.
# after_stat(density) replaces the deprecated `..density..` notation.
p3 <- ggplot(data = dat1, aes(x = age)) +
  geom_histogram(aes(y = after_stat(density)), col = "white", fill = "blue") +
  geom_density(col = "red")

# Show the three plots in a single row.
grid.arrange(p1, p2, p3, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 242 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 242 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 242 rows containing non-finite values (stat_bin).
## Warning: Removed 242 rows containing non-finite values (stat_density).