This is a project for self-learning purposes, based on a reference from Tech Tribe. You can download the dataset from here.

Libraries

library(Hmisc) # for data analysis inc missing value handle, character manipulation
library(dplyr) # data manipulation
library(tidyverse) # data science lib
library(ggplot2) # data visualization
library(glue) # tooltip
library(plotly) # interactive visualization
library(rworldmap) # world map

Import Data

# Load the COVID-19 line list.
# The workspace is deliberately NOT cleared with rm(list = ls()): that call
# silently destroys whatever objects the user already has in the session.
# Run the analysis in a fresh R session instead.
data_path <- "C:/Users/Asus/Downloads/COVID19_line_list_data.csv"
dt <- read.csv(data_path)
head(dt)
##   id case_in_country reporting.date  X
## 1  1              NA      1/20/2020 NA
## 2  2              NA      1/20/2020 NA
## 3  3              NA      1/21/2020 NA
## 4  4              NA      1/21/2020 NA
## 5  5              NA      1/21/2020 NA
## 6  6              NA      1/21/2020 NA
##                                                                                                                                                                                                                                                                                                                                                                                                                summary
## 1 First confirmed imported COVID-19 pneumonia patient in Shenzhen (from Wuhan): male, 66, shenzheng residence, visited relatives in Wuhan on 12/29/2019, symptoms onset on 01/03/2020, returned to Shenzhen and seek medical care on 01/04/2020, hospitalized on 01/11/2020, sample sent to China CDC for testing on 01/18/2020, confirmed on 01/19/2020. 8 others under medical observation, contact tracing ongoing.
## 2                                                                                                                                                                    First confirmed imported COVID-19 pneumonia patient in Shanghai (from Wuhan): female, 56, Wuhan residence, arrived in Shanghai from Wuhan on 01/12/2020, symptom onset and visited fever clinic on 01/15/2020, laboratory confirmed on 01/20/2020
## 3                                                                                                                                        First confirmed imported cases in Zhejiang: patient is male, 46, lives in Wuhan, self-driving from Wuhan to Hangzhou on 01/03/2020, symptom onset 01/04/2020, hospitalized on 01/17/2020, sample deliver to China CDC for testing on 01/20/2020, test positive on 01/21/2020.
## 4                                                                                                                                                                                                                                            new confirmed imported COVID-19 pneumonia in Tianjin: female, age 60, recently visited Wuhan, visited fever clinic on 01/19/2020 in Tianjin then quarantined immediately.
## 5                                                                                                                                                                                                                                                                                                              new confirmed imported COVID-19 pneumonia in Tianjin: male, age 58, visited fever clinic on 01/14/2020.
## 6                                                                                                                                                                                                                                                     First confirmed imported COVID-19 pneumonia patient in Chongqing (from Wuhan): female, age 44, symptoms onset on 01/15/2020, laboratory confirmed on 01/21/2020.
##              location country gender age symptom_onset If_onset_approximated
## 1 Shenzhen, Guangdong   China   male  66      01/03/20                     0
## 2            Shanghai   China female  56     1/15/2020                     0
## 3            Zhejiang   China   male  46      01/04/20                     0
## 4             Tianjin   China female  60          <NA>                    NA
## 5             Tianjin   China   male  58          <NA>                    NA
## 6           Chongqing   China female  44     1/15/2020                     0
##   hosp_visit_date exposure_start exposure_end visiting.Wuhan from.Wuhan death
## 1        01/11/20     12/29/2019     01/04/20              1          0     0
## 2       1/15/2020           <NA>     01/12/20              0          1     0
## 3       1/17/2020           <NA>     01/03/20              0          1     0
## 4       1/19/2020           <NA>         <NA>              1          0     0
## 5       1/14/2020           <NA>         <NA>              0          0     0
## 6            <NA>           <NA>         <NA>              0          1     0
##   recovered symptom                                                 source
## 1         0                           Shenzhen Municipal Health Commission
## 2         0         Official Weibo of Shanghai Municipal Health Commission
## 3         0                         Health Commission of Zhejiang Province
## 4         0                                               人民日报官方微博
## 5         0                                               人民日报官方微博
## 6         0                          Chongqing Municipal Health Commission
##                                                                                                             link
## 1                                                         http://wjw.sz.gov.cn/wzx/202001/t20200120_18987787.htm
## 2 https://www.weibo.com/2372649470/IqogQhgfa?from=page_1001062372649470_profile&wvr=6&mod=weibotime&type=comment
## 3                                                http://www.zjwjw.gov.cn/art/2020/1/21/art_1202101_41786033.html
## 4                                                                    https://m.weibo.cn/status/4463235401268457?
## 5                                                                    https://m.weibo.cn/status/4463235401268457?
## 6                                                               http://wsjkw.cq.gov.cn/tzgg/20200121/249730.html
##   X.1 X.2 X.3 X.4 X.5 X.6
## 1  NA  NA  NA  NA  NA  NA
## 2  NA  NA  NA  NA  NA  NA
## 3  NA  NA  NA  NA  NA  NA
## 4  NA  NA  NA  NA  NA  NA
## 5  NA  NA  NA  NA  NA  NA
## 6  NA  NA  NA  NA  NA  NA

Data Inspection and Cleaning

# Size of the raw data: number of rows, then number of columns
c(nrow(dt), ncol(dt))
## [1] 1085   27
# Column names of the raw data
names(dt)
##  [1] "id"                    "case_in_country"       "reporting.date"       
##  [4] "X"                     "summary"               "location"             
##  [7] "country"               "gender"                "age"                  
## [10] "symptom_onset"         "If_onset_approximated" "hosp_visit_date"      
## [13] "exposure_start"        "exposure_end"          "visiting.Wuhan"       
## [16] "from.Wuhan"            "death"                 "recovered"            
## [19] "symptom"               "source"                "link"                 
## [22] "X.1"                   "X.2"                   "X.3"                  
## [25] "X.4"                   "X.5"                   "X.6"
# Drop the columns this analysis does not use, then count the missing values
# that remain in each column.
dt <- select(
  dt,
  -all_of(c(
    "case_in_country", "X", "X.1", "X.2", "X.3", "X.4", "X.5", "X.6", "source"
  ))
)
dt %>%
  is.na() %>%
  colSums()
##                    id        reporting.date               summary 
##                     0                     1                     5 
##              location               country                gender 
##                     0                     0                   183 
##                   age         symptom_onset If_onset_approximated 
##                   242                   522                   525 
##       hosp_visit_date        exposure_start          exposure_end 
##                   577                   957                   744 
##        visiting.Wuhan            from.Wuhan                 death 
##                     0                     4                     0 
##             recovered               symptom                  link 
##                     0                     0                     0
# Number of rows that exactly repeat an earlier row
dt %>%
  duplicated() %>%
  sum()
## [1] 0

Our data has no duplicate rows, but it does have missing values. You can also inspect the data with describe() from Hmisc.

Data Wrangling and Visualization

# Death rate: flag every case whose `death` entry is anything other than 0 as
# fatal (1) and all remaining cases as 0, then take the share of fatal cases.
dt[["death_count"]] <- as.integer(dt[["death"]] != 0)
with(dt, sum(death_count) / length(death_count))
## [1] 0.05806452

The death rate is about 5.8% (0.058 as a proportion).

# Which ages account for the most deaths?
# A case counts as fatal when `death` is anything other than 0 -- the same rule
# used to build `death_count`. Filtering on `death == 1` alone would drop fatal
# cases recorded with any other non-zero value. Cases with an unknown age are
# excluded so the chart never shows an "NA" age bar.
char1 <- dt %>%
  filter(death != 0, !is.na(age)) %>%
  group_by(age) %>%
  summarise(total = n()) %>%
  arrange(desc(total)) %>%
  head(5) %>%
  mutate(label = paste("Age:", age, "\nTotal:", total, "People")) %>%
  ggplot(aes(
    x = reorder(as.factor(age), -total), # bars ordered from most to fewest
    y = total,
    fill = as.factor(age),
    text = label                         # picked up by ggplotly as tooltip
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Most Common Ages of Death Due to COVID-19",
    x = "Age (Years Old)",
    y = "Total Deaths"
  ) +
  theme_classic() +
  theme(legend.position = "none")

ggplotly(char1, tooltip = "text")

Older people are more easily infected by the coronavirus, and most of those who died were elderly.

# Which gender died the most?
# Split the cases by recorded gender (rows with a missing gender fall into
# neither group), then compute the share of fatal cases among men.
male <- dt[which(dt$gender == "male"), ]
female <- dt[which(dt$gender == "female"), ]
mean(male[["death_count"]])
## [1] 0.08461538
# Share of fatal cases among women
mean(female[["death_count"]])
## [1] 0.03664921
# Deaths by gender.
# The chart is titled "Death by Gender" with a "Total Deaths" axis, so each bar
# must show the number of fatal cases (the sum of `death_count`), not the
# number of reported cases that n() would return.
by_gender <- rbind(male, female)

char2 <- by_gender %>%
  group_by(gender) %>%
  summarise(total = sum(death_count)) %>%
  mutate(label = paste("Gender:", gender, "\nTotal:", total, "People")) %>%
  ggplot(aes(
    x = as.factor(gender),
    y = total,
    fill = as.factor(gender),
    text = label # picked up by ggplotly as the tooltip
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Death by Gender",
    x = "Gender",
    y = "Total Deaths"
  )

ggplotly(char2, tooltip = "text")

Men are more likely to die when infected with the coronavirus: about 8.5% of male cases were fatal, compared with about 3.7% of female cases.

# Number of reported cases in each country, largest first
case_country <- dt %>%
  group_by(country) %>%
  tally(name = "total") %>%
  arrange(desc(total))
case_country
## # A tibble: 38 × 2
##    country     total
##    <chr>       <int>
##  1 China         197
##  2 Japan         190
##  3 South Korea   114
##  4 Hong Kong      94
##  5 Singapore      93
##  6 Germany        54
##  7 Thailand       41
##  8 France         39
##  9 Spain          34
## 10 Taiwan         34
## # ℹ 28 more rows
# Join the per-country totals to the world map, matching on country name
map_data <- joinCountryData2Map(
  dF = case_country,
  joinCode = "NAME",
  nameJoinColumn = "country"
)
## 36 codes from your data successfully matched countries in the map
## 2 codes from your data failed to match with a country code in the map
## 207 codes from the map weren't represented in your data
# Draw the world map, colouring each country by its case total
mapCountryData(
  mapToPlot = map_data,
  nameColumnToPlot = "total",
  catMethod = "fixedWidth",
  colourPalette = "heat",
  mapTitle = "COVID-19 Cases by Country"
)

In early 2020, the most affected countries were in East Asia, such as China, Japan, and South Korea.