hw_01_yourname.Rmd and use it for your solutions.Only include necessary code to answer the questions.
Most of the functions you use should be from the tidyverse. Unnecessary Base R or other packages not covered in class will result in point deductions.
Use Pull requests and or email to ask me any questions. If you email, please ensure your most recent code is pushed to GitHub.
Learning Outcome
Context
url_in to store this URL: “https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series”# Installed library
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
df with a variable called file_names with a row for each of the following four file names to be loaded from the URL:
df <- tibble(file_names = c("time_series_covid19_confirmed_global.csv",
"time_series_covid19_deaths_global.csv",
"time_series_covid19_confirmed_US.csv",
"time_series_covid19_deaths_US.csv")) -> df
url that puts url_in on the front of each file_name to create a complete URL.df %>%
mutate(url = str_c(url_in, file_names, sep = "")) -> df
mutate() with map() to create a list column called data with each row holding the downloaded data frame for each file name.df %>%
mutate(data = map(url, ~read_csv(., na = ""))) -> df
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## `Province/State` = col_character(),
## `Country/Region` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso3 = col_character(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso3 = col_character(),
## Admin2 = col_character(),
## Province_State = col_character(),
## Country_Region = col_character(),
## Combined_Key = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
df called "case_types" with the unique portions of the file names.df %>%
mutate(case_types = as.factor(str_extract(file_names, "[:alpha:]*_[gU][:alpha:]*"))) ->
df
# alpha = Any letter, [A-Za-z]
# reference: https://www.petefreitag.com/cheatsheets/regex/character-classes/
case_types and data from df.df should have four observations of two variables.df %>%
select(case_types, data) -> df
map() to add the names from each of the four data frames to a new variable in df called vars and visually compare them to identify issues.df %>%
mutate(vars = map(df$data, names)) -> df
# map(df$vars, ~unlist(.)[1:15]) for checking
fix_names() which takes three arguments, a data frame, a string, and a replacement pattern. It should replace all occurrences of the string in the names of the variables in the data frame with the replacement pattern.Population and County to the data frames where missing.Country_State that combines the country with the province/state while keeping the original columns.df$vars when complete to check for consistency.map_if()# a
fix_names <- function(df, pattern, rePattern){
stopifnot(is.data.frame(df), is.character(pattern), is.character(rePattern))
names(df) <- str_replace_all(names(df), pattern, rePattern)
return(df)
}
# b-f
df %>%
mutate(data = map(data, ~fix_names(., "([ey])/", "\\1_")),
data = map(data, ~fix_names(., "Admin2", "County")),
data = map(data, ~fix_names(., "Long_", "Long")),
data = map_if(data, str_detect(df$case_types, "US"),
~select(., -c("UID", "iso2", "iso3",
"code3", "FIPS", "Combined_Key"))),
data = map_if(data, str_detect(df$case_types, "global"),
~mutate(., County = "NA")),
data = map_if(data, !str_detect(df$case_types, "deaths_US"),
~mutate(., Population = 0)),
data = map(data, ~unite(., "Country_State",
c("Country_Region", "Province_State"),
remove = FALSE, na.rm = TRUE,
sep = "_"))
) -> df
# g
df %>%
mutate(vars = map(df$data, names)) -> df # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking
map() along with pivot_longer to tidy each data frame and as part of the pivot, ensure the daily values are in a variable called “Date” and use a lubridate function inside the pivot to ensure it is of class date.df_longlibrary(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
df %>%
mutate(data = map(data, ~pivot_longer(data = ., cols = contains("/"),
names_to = "Date",
values_to = "dailyValues"))
) -> df
# df$vars <- map(df$data, names) # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking
# crate a function to fix in type of Date
mdyDate <- function(df, varsDate){
stopifnot(is.data.frame(df), is.character(varsDate))
df[[varsDate]] <- mdy(df[[varsDate]])
return(df)
}
df %>%
mutate(data = map(data, ~mdyDate(., "Date"))) -> df_long
# str(df_long) # check the data set
map() to add a new variable called Continent to each data frame.countrycode::countrycodelibrary(countrycode)
df_long %>%
mutate(data = map(data, ~mutate(., Continent = countrycode(Country_Region,
origin = "country.name",
destination = "continent")))
) -> df_long
map() with case_when() to replace the NAs due to “Diamond Princess”, “Kosovo”, “MS Zaandam” with the most appropriate continentmap() with unique() to confirm five continents in the global data frames and one in the US data framesdf_long %>%
mutate(data = map(data, ~mutate(., Continent = case_when(
Country_Region == "Diamond Princess" ~ "Asia",
Country_Region == "Kosovo" ~ "Americas",
Country_Region == "MS Zaandam" ~ "Europe",
TRUE ~ Continent)
))) -> df_long
map(df_long$data, ~unique(.$Continent))
## [[1]]
## [1] "Asia" "Europe" "Africa" "Americas" "Oceania" NA
##
## [[2]]
## [1] "Asia" "Europe" "Africa" "Americas" "Oceania" NA
##
## [[3]]
## [1] "Americas"
##
## [[4]]
## [1] "Americas"
df_long and save into a new data frame called df_alldf and df_long dataframes from the environmentvars variable from df_all# 1
df_long %>%
unnest(cols = data) %>%
ungroup() -> df_all
# 2
remove(df, df_long)
# 3
df_all %>%
select(-vars) -> df_all
df_popdf_all data frame.df_pop# 1
df_pop <- read_csv("./data/WPP2019_TotalPopulation.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## LocID = col_double(),
## Location = col_character(),
## PopTotal = col_double(),
## PopDensity = col_double()
## )
# summarize(df_pop, across(everything(), ~sum(is.na(.)))) # check NAs
# 2
semi_join(df_pop, df_all, by = c("Location" = "Country_Region")) -> df_pop
# 3
df_pop %>%
mutate(rank_p = rank(-PopTotal, na.last = TRUE),
rank_d = rank(-PopDensity, na.last = TRUE),
PopTotal = (PopTotal*1000)) -> df_pop
df_alldf_pop to df_allPopulation for US CountiesPopTotal for the country leveldf_all %>%
inner_join(df_pop, by = c("Country_Region" = "Location")) -> df_all
df_all
## # A tibble: 3,483,832 x 16
## case_types Country_State Province_State Country_Region Lat Long County
## <fct> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 2 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 3 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 4 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 5 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 6 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 7 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 8 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 9 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## 10 confirmed_glo… Afghanistan <NA> Afghanistan 33.9 67.7 NA
## # … with 3,483,822 more rows, and 9 more variables: Population <dbl>,
## # Date <date>, dailyValues <dbl>, Continent <chr>, LocID <dbl>,
## # PopTotal <dbl>, PopDensity <dbl>, rank_p <dbl>, rank_d <dbl>
Country_Region, Continent case_type, rank_p and rank_d that summarizes the current totals and the totals as a percentage of total population.# 1
df_all %>%
group_by(Country_Region, Continent, case_types, rank_p, rank_d) %>%
summarise(ttlCases = max(dailyValues), ttlPerc = ttlCases/last(PopTotal)*100) %>%
ungroup() -> tmp
## `summarise()` has grouped output by 'Country_Region', 'Continent', 'case_types', 'rank_p'. You can override using the `.groups` argument.
# 2
## Top 20 Countries with the most confirmed cases and the percentage effects
tmp %>%
filter(case_types == "confirmed_global") %>%
arrange(desc(ttlCases)) %>%
head(20) -> confirmed20
confirmed20
## # A tibble: 20 x 7
## Country_Region Continent case_types rank_p rank_d ttlCases ttlPerc
## <chr> <chr> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 US Americas confirmed_global 3 131 33026624 10.0
## 2 India Asia confirmed_global 2 14 25772440 1.89
## 3 Brazil Americas confirmed_global 6 139 15812055 7.49
## 4 France Europe confirmed_global 19 63 5863138 9.00
## 5 Turkey Asia confirmed_global 15 70 5151038 6.17
## 6 United Kingdom Europe confirmed_global 18 29 4452756 6.59
## 7 Italy Europe confirmed_global 20 45 4172525 6.89
## 8 Germany Europe confirmed_global 14 33 3627777 4.34
## 9 Spain Europe confirmed_global 24 83 3625928 7.76
## 10 Argentina Americas confirmed_global 25 156 3411160 7.62
## 11 Colombia Americas confirmed_global 23 125 3161126 6.28
## 12 Poland Europe confirmed_global 32 61 2859261 7.55
## 13 Iran Asia confirmed_global 16 118 2792204 3.37
## 14 Mexico Americas confirmed_global 9 107 2387512 1.87
## 15 Ukraine Europe confirmed_global 27 96 2222115 5.05
## 16 Peru Americas confirmed_global 37 138 1903615 5.86
## 17 Indonesia Asia confirmed_global 4 53 1753101 0.648
## 18 Czechia Europe confirmed_global 75 55 1655414 15.5
## 19 South Africa Africa confirmed_global 21 121 1621362 2.77
## 20 Netherlands Europe confirmed_global 59 13 1610868 9.42
# 3
## Top 20 Countries with the most died cases and the percentage effects
tmp %>%
filter(case_types == "deaths_global") %>%
arrange(desc(ttlCases)) %>%
head(20) -> deaths20
deaths20
## # A tibble: 20 x 7
## Country_Region Continent case_types rank_p rank_d ttlCases ttlPerc
## <chr> <chr> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 US Americas deaths_global 3 131 587874 0.179
## 2 Brazil Americas deaths_global 6 139 441691 0.209
## 3 India Asia deaths_global 2 14 287122 0.0210
## 4 Mexico Americas deaths_global 9 107 220850 0.173
## 5 United Kingdom Europe deaths_global 18 29 127694 0.189
## 6 Italy Europe deaths_global 20 45 124646 0.206
## 7 France Europe deaths_global 19 63 107390 0.165
## 8 Germany Europe deaths_global 14 33 86908 0.104
## 9 Colombia Americas deaths_global 23 125 82743 0.164
## 10 Spain Europe deaths_global 24 83 79568 0.170
## 11 Iran Asia deaths_global 16 118 77765 0.0938
## 12 Argentina Americas deaths_global 25 156 72265 0.161
## 13 Poland Europe deaths_global 32 61 72250 0.191
## 14 Peru Americas deaths_global 37 138 67034 0.206
## 15 South Africa Africa deaths_global 21 121 55507 0.0948
## 16 Ukraine Europe deaths_global 27 96 50658 0.115
## 17 Indonesia Asia deaths_global 4 53 48669 0.0180
## 18 Turkey Asia deaths_global 15 70 45419 0.0544
## 19 Czechia Europe deaths_global 75 55 29948 0.280
## 20 Romania Europe deaths_global 52 90 29716 0.153
tmp %>%
arrange(desc(ttlPerc)) %>%
head(20) -> perc20
perc20 %>%
# anti_join() return all rows from x without a match in y.
anti_join(confirmed20) %>%
anti_join(deaths20) %>%
select(Country_Region)
## Joining, by = c("Country_Region", "Continent", "case_types", "rank_p", "rank_d", "ttlCases", "ttlPerc")
## Joining, by = c("Country_Region", "Continent", "case_types", "rank_p", "rank_d", "ttlCases", "ttlPerc")
## # A tibble: 16 x 1
## Country_Region
## <chr>
## 1 Andorra
## 2 Montenegro
## 3 San Marino
## 4 Bahrain
## 5 Slovenia
## 6 Luxembourg
## 7 Sweden
## 8 Seychelles
## 9 Israel
## 10 Lithuania
## 11 Estonia
## 12 Maldives
## 13 Belgium
## 14 Panama
## 15 Croatia
## 16 Georgia
confirmed <- confirmed20$Country_Region
df_all %>%
filter(case_types == "confirmed_global", Country_State == confirmed) %>%
ggplot() +
geom_line(mapping = aes(x = Date, y = dailyValues, color = Country_State)) +
facet_wrap(~Continent) +
scale_y_log10() +
theme_bw() +
ylab("Cumulative Cases") +
ggtitle("The COVID-19 confirmed cases and timeline by Top 20 countries")
## Warning in Country_State == confirmed: longer object length is not a multiple of
## shorter object length
## Warning: Transformation introduced infinite values in continuous y-axis
2. Based on the plot of confirmed cases, we can see the deaths have positive association with confirmed cases of covid. It is note worthy that Turkey and Ecuador have a large number of deaths but they don’t have high confirmed cases.
deaths <- deaths20$Country_Region
df_all %>%
filter(case_types == "deaths_global", Country_State == deaths) %>%
ggplot() +
geom_line(mapping = aes(x = Date, y = dailyValues, color = Country_State)) +
facet_wrap(~Continent) +
scale_y_log10() +
theme_bw() +
ylab("Cumulative Deaths") +
ggtitle("The COVID-19 deaths and timeline by Top 20 countries")
## Warning in Country_State == deaths: longer object length is not a multiple of
## shorter object length
## Warning: Transformation introduced infinite values in continuous y-axis