This project is for self-learning purposes, with reference to Tech Tribe. You can download the dataset from here.
library(Hmisc) # for data analysis inc missing value handle, character manipulation
library(dplyr) # data manipulation
library(tidyverse) # data science lib
library(ggplot2) # data visualization
library(glue) # tooltip
library(plotly) # interactive visualization
library(rworldmap) # world map
# Load the COVID-19 line list into `dt`.
# The workspace is intentionally not cleared with rm(list = ls()): that call
# silently deletes the caller's objects, does not unload packages or reset
# options, and so does not give a clean session. Restart R instead.
dt <- read.csv("C:/Users/Asus/Downloads/COVID19_line_list_data.csv")
# Preview the first rows to check that the file was parsed as expected
head(dt)
## id case_in_country reporting.date X
## 1 1 NA 1/20/2020 NA
## 2 2 NA 1/20/2020 NA
## 3 3 NA 1/21/2020 NA
## 4 4 NA 1/21/2020 NA
## 5 5 NA 1/21/2020 NA
## 6 6 NA 1/21/2020 NA
## summary
## 1 First confirmed imported COVID-19 pneumonia patient in Shenzhen (from Wuhan): male, 66, shenzheng residence, visited relatives in Wuhan on 12/29/2019, symptoms onset on 01/03/2020, returned to Shenzhen and seek medical care on 01/04/2020, hospitalized on 01/11/2020, sample sent to China CDC for testing on 01/18/2020, confirmed on 01/19/2020. 8 others under medical observation, contact tracing ongoing.
## 2 First confirmed imported COVID-19 pneumonia patient in Shanghai (from Wuhan): female, 56, Wuhan residence, arrived in Shanghai from Wuhan on 01/12/2020, symptom onset and visited fever clinic on 01/15/2020, laboratory confirmed on 01/20/2020
## 3 First confirmed imported cases in Zhejiang: patient is male, 46, lives in Wuhan, self-driving from Wuhan to Hangzhou on 01/03/2020, symptom onset 01/04/2020, hospitalized on 01/17/2020, sample deliver to China CDC for testing on 01/20/2020, test positive on 01/21/2020.
## 4 new confirmed imported COVID-19 pneumonia in Tianjin: female, age 60, recently visited Wuhan, visited fever clinic on 01/19/2020 in Tianjin then quarantined immediately.
## 5 new confirmed imported COVID-19 pneumonia in Tianjin: male, age 58, visited fever clinic on 01/14/2020.
## 6 First confirmed imported COVID-19 pneumonia patient in Chongqing (from Wuhan): female, age 44, symptoms onset on 01/15/2020, laboratory confirmed on 01/21/2020.
## location country gender age symptom_onset If_onset_approximated
## 1 Shenzhen, Guangdong China male 66 01/03/20 0
## 2 Shanghai China female 56 1/15/2020 0
## 3 Zhejiang China male 46 01/04/20 0
## 4 Tianjin China female 60 <NA> NA
## 5 Tianjin China male 58 <NA> NA
## 6 Chongqing China female 44 1/15/2020 0
## hosp_visit_date exposure_start exposure_end visiting.Wuhan from.Wuhan death
## 1 01/11/20 12/29/2019 01/04/20 1 0 0
## 2 1/15/2020 <NA> 01/12/20 0 1 0
## 3 1/17/2020 <NA> 01/03/20 0 1 0
## 4 1/19/2020 <NA> <NA> 1 0 0
## 5 1/14/2020 <NA> <NA> 0 0 0
## 6 <NA> <NA> <NA> 0 1 0
## recovered symptom source
## 1 0 Shenzhen Municipal Health Commission
## 2 0 Official Weibo of Shanghai Municipal Health Commission
## 3 0 Health Commission of Zhejiang Province
## 4 0 人民日报官方微博
## 5 0 人民日报官方微博
## 6 0 Chongqing Municipal Health Commission
## link
## 1 http://wjw.sz.gov.cn/wzx/202001/t20200120_18987787.htm
## 2 https://www.weibo.com/2372649470/IqogQhgfa?from=page_1001062372649470_profile&wvr=6&mod=weibotime&type=comment
## 3 http://www.zjwjw.gov.cn/art/2020/1/21/art_1202101_41786033.html
## 4 https://m.weibo.cn/status/4463235401268457?
## 5 https://m.weibo.cn/status/4463235401268457?
## 6 http://wsjkw.cq.gov.cn/tzgg/20200121/249730.html
## X.1 X.2 X.3 X.4 X.5 X.6
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA NA
## 3 NA NA NA NA NA NA
## 4 NA NA NA NA NA NA
## 5 NA NA NA NA NA NA
## 6 NA NA NA NA NA NA
# Size of the data frame: number of rows, then number of columns
c(nrow(dt), ncol(dt))
## [1] 1085 27
# Names of all columns
names(dt)
## [1] "id" "case_in_country" "reporting.date"
## [4] "X" "summary" "location"
## [7] "country" "gender" "age"
## [10] "symptom_onset" "If_onset_approximated" "hosp_visit_date"
## [13] "exposure_start" "exposure_end" "visiting.Wuhan"
## [16] "from.Wuhan" "death" "recovered"
## [19] "symptom" "source" "link"
## [22] "X.1" "X.2" "X.3"
## [25] "X.4" "X.5" "X.6"
# Drop the columns the analysis does not use, then count the missing values
# left in each remaining column
unused_cols <- c(
  "case_in_country", "X", "X.1", "X.2", "X.3", "X.4", "X.5", "X.6", "source"
)
dt <- dt %>%
  select(-all_of(unused_cols))
colSums(is.na(dt))
## id reporting.date summary
## 0 1 5
## location country gender
## 0 0 183
## age symptom_onset If_onset_approximated
## 242 522 525
## hosp_visit_date exposure_start exposure_end
## 577 957 744
## visiting.Wuhan from.Wuhan death
## 0 4 0
## recovered symptom link
## 0 0 0
# Number of rows that are exact duplicates of an earlier row
dt %>%
  duplicated() %>%
  sum()
## [1] 0
Our data has no duplicate rows but does have missing values. You can also inspect the data with describe().
# Death rate
# Flag every case whose `death` entry is anything other than 0 as a death
# (1 = died, 0 = did not), then take the share of flagged cases
dt[["death_count"]] <- as.integer(dt[["death"]] != 0)
sum(dt[["death_count"]]) / nrow(dt)
## [1] 0.05806452
The death rate is about 5.8% (0.058 as a proportion).
# Which ages account for the most deaths
# Deaths are selected with `death_count`, the 0/1 indicator built from
# `death != 0`, so this chart uses the same definition of a death as the
# death-rate and per-gender figures. Filtering on `death == 1` would miss any
# death recorded with a value other than 1 (presumably the raw column holds
# such values, given how `death_count` is derived; verify against the data).
# Cases with a missing age are excluded so an `NA` group cannot occupy one of
# the five slots.
char1 <- dt %>%
  filter(death_count == 1, !is.na(age)) %>%
  # One row per age with its number of deaths, most deaths first
  count(age, name = "total", sort = TRUE) %>%
  slice_head(n = 5) %>%
  # Tooltip text shown by plotly on hover
  mutate(label = paste("Age:", age, "\nTotal:", total, "People")) %>%
  ggplot(aes(x = reorder(as.factor(age), -total), y = total,
             fill = as.factor(age), text = label)) +
  geom_col(position = "dodge") +
  labs(title = "Most Common Ages of Death Due to COVID-19",
       x = "Age (Years Old)",
       y = "Total Deaths") +
  theme_classic() +
  theme(legend.position = "none")
# Render interactively, showing only the custom label in the tooltip
ggplotly(char1, tooltip = "text")
Older people are more easily infected by the coronavirus, and most of those who died were older.
# Compare the death rate between genders
# Rows with a missing gender fall into neither subset
male <- dt[which(dt$gender == "male"), ]
female <- dt[which(dt$gender == "female"), ]
# Share of male cases flagged as deaths
mean(male$death_count)
## [1] 0.08461538
# Share of female cases flagged as deaths
mean(female$death_count)
## [1] 0.03664921
# Bar chart of deaths by gender
# Stack the male and female subsets (cases without a recorded gender are not
# part of either subset)
by_gender <- rbind(male, female)
char2 <- by_gender %>%
  group_by(gender) %>%
  # Sum the 0/1 death indicator so the bars show deaths, as the title and
  # y-axis say; n() would count every case of that gender instead
  summarise(total = sum(death_count)) %>%
  ungroup() %>%
  # Tooltip text shown by plotly on hover
  mutate(label = paste("Gender:", gender, "\nTotal:", total, "People")) %>%
  ggplot(aes(x = as.factor(gender), y = total,
             fill = as.factor(gender), text = label)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Death by Gender",
       x = "Gender",
       y = "Total Deaths")
# Render interactively, showing only the custom label in the tooltip
ggplotly(char2, tooltip = "text")
Men are more likely to die when infected with the coronavirus (about 8.5% of male cases versus 3.7% of female cases).
# Number of recorded cases per country, largest first
case_country <- dt %>%
  group_by(country) %>%
  tally(name = "total") %>%
  arrange(desc(total))
# Print the table
case_country
## # A tibble: 38 × 2
## country total
## <chr> <int>
## 1 China 197
## 2 Japan 190
## 3 South Korea 114
## 4 Hong Kong 94
## 5 Singapore 93
## 6 Germany 54
## 7 Thailand 41
## 8 France 39
## 9 Spain 34
## 10 Taiwan 34
## # ℹ 28 more rows
# Attach the per-country totals to the rworldmap country map, matching the
# `country` column against the map's country names
map_data <- joinCountryData2Map(
  dF = case_country,
  joinCode = "NAME",
  nameJoinColumn = "country"
)
## 36 codes from your data successfully matched countries in the map
## 2 codes from your data failed to match with a country code in the map
## 207 codes from the map weren't represented in your data
# Draw the world map shaded by the `total` column, using fixed-width
# categories and the "heat" palette
mapCountryData(
  mapToPlot = map_data,
  nameColumnToPlot = "total",
  catMethod = "fixedWidth",
  colourPalette = "heat",
  mapTitle = "COVID-19 Cases by Country"
)
In early 2020, most infections were in East Asia, in countries such as China, Japan, and South Korea.