#install.packages("Hmisc")
#install.packages("ggbeeswarm")
#install.packages("skimr")
#install.packages("janitor")
#install.packages("ggdist")lab04
1 Lab 4 Deliverable
library(readxl)
GDP_data <- read_excel("data/gapminder_broadband_per_100.xlsx")
head(GDP_data)# A tibble: 6 × 15
Fixed broadband Internet…¹ `1998` `1999` `2000` `2001` `2002` `2003` `2004`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan NA NA NA 0 0 0 6.88e-4
2 Albania NA NA NA 0 0 0 0
3 Algeria NA NA NA 0 0 0.0564 1.11e-1
4 American Samoa NA NA NA 0 0 NA NA
5 Andorra NA NA NA NA 1.66 4.99 8.34e+0
6 Angola NA NA NA 0 0 0 0
# ℹ abbreviated name: ¹`Fixed broadband Internet subscribers (per 100 people)`
# ℹ 7 more variables: `2005` <dbl>, `2006` <dbl>, `2007` <dbl>, `2008` <dbl>,
# `2009` <dbl>, `2010` <dbl>, `2011` <lgl>
install.packages("WDI") Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)
install.packages("countrycode") Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)
1.1 Part 1: Tidying
library(readxl)
library(WDI)
library(countrycode)
library(dplyr)
library(tidyr)
library(janitor)
Attaching package: 'janitor'
The following objects are masked from 'package:stats':
chisq.test, fisher.test
GDP_data <- read_excel("data/gapminder_broadband_per_100.xlsx")
glimpse(GDP_data)Rows: 213
Columns: 15
$ `Fixed broadband Internet subscribers (per 100 people)` <chr> "Afghanistan",…
$ `1998` <dbl> NA, NA, NA, NA…
$ `1999` <dbl> NA, NA, NA, NA…
$ `2000` <dbl> NA, NA, NA, NA…
$ `2001` <dbl> 0.0000000000, …
$ `2002` <dbl> 0.0000000000, …
$ `2003` <dbl> 0.000000e+00, …
$ `2004` <dbl> 6.880265e-04, …
$ `2005` <dbl> 7.356639e-04, …
$ `2006` <dbl> 0.001625928, N…
$ `2007` <dbl> 0.001581161, 0…
$ `2008` <dbl> 0.001537626, 2…
$ `2009` <dbl> 0.00299058, 2.…
$ `2010` <dbl> 0.004362367, 3…
$ `2011` <lgl> NA, NA, NA, NA…
gdp_data_no_really_it_is <- WDI(
country = "all",
indicator = c(gdp_per_capita = "NY.GDP.PCAP.KD"),
start = 1998,
end = 2011,
extra = TRUE
) %>%
select(iso3c, year, gdp_per_capita, region)
head(gdp_data_no_really_it_is) iso3c year gdp_per_capita region
1 AFG 2011 525.4270 Middle East, North Africa, Afghanistan & Pakistan
2 AFG 2010 542.8710 Middle East, North Africa, Afghanistan & Pakistan
3 AFG 2009 488.8307 Middle East, North Africa, Afghanistan & Pakistan
4 AFG 2008 417.6473 Middle East, North Africa, Afghanistan & Pakistan
5 AFG 2007 410.7577 Middle East, North Africa, Afghanistan & Pakistan
6 AFG 2006 367.7583 Middle East, North Africa, Afghanistan & Pakistan
broadband_long <- GDP_data %>%
clean_names() %>%
rename(country = fixed_broadband_internet_subscribers_per_100_people) %>%
pivot_longer(
cols = matches("^x?\\d{4}$"),
names_to = "year",
values_to = "broadband_per_100"
) %>%
mutate(
year = as.numeric(gsub("^x", "", year)),
broadband_per_100 = as.numeric(broadband_per_100),
iso3c = countrycode(
country,
"country.name",
"iso3c",
custom_match = c(
"Channel Islands" = "CHI",
"Kosovo" = "XKX",
"Saint Martin" = "MAF"
)
)
)
# change iso3c = 3-letter country code, to match the names of the country as they are listed in the broadband_internet
head(broadband_long)# A tibble: 6 × 4
country year broadband_per_100 iso3c
<chr> <dbl> <dbl> <chr>
1 Afghanistan 1998 NA AFG
2 Afghanistan 1999 NA AFG
3 Afghanistan 2000 NA AFG
4 Afghanistan 2001 0 AFG
5 Afghanistan 2002 0 AFG
6 Afghanistan 2003 0 AFG
GDP_long <- broadband_long %>%
left_join(
gdp_data_no_really_it_is,
by = c("iso3c", "year")
)
head(GDP_long)# A tibble: 6 × 6
country year broadband_per_100 iso3c gdp_per_capita region
<chr> <dbl> <dbl> <chr> <dbl> <chr>
1 Afghanistan 1998 NA AFG NA Middle East, North A…
2 Afghanistan 1999 NA AFG NA Middle East, North A…
3 Afghanistan 2000 NA AFG 308. Middle East, North A…
4 Afghanistan 2001 0 AFG 277. Middle East, North A…
5 Afghanistan 2002 0 AFG 338. Middle East, North A…
6 Afghanistan 2003 0 AFG 346. Middle East, North A…
1.2 Part 2: Gapminder
Recall the Task Menu:
- Get the maximum and minimum of GDP per capita for all continents.
GDP_long %>%
group_by(region) %>%
summarise(
min_gdp = min(gdp_per_capita, na.rm = TRUE),
max_gdp = max(gdp_per_capita, na.rm = TRUE)
)Warning: There were 2 warnings in `summarise()`.
The first warning was:
ℹ In argument: `min_gdp = min(gdp_per_capita, na.rm = TRUE)`.
ℹ In group 8: `region = NA`.
Caused by warning in `min()`:
! no non-missing arguments to min; returning Inf
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# A tibble: 8 × 3
region min_gdp max_gdp
<chr> <dbl> <dbl>
1 East Asia & Pacific 242. 88169.
2 Europe & Central Asia 389. 163244.
3 Latin America & Caribbean 1287. 96203.
4 Middle East, North Africa, Afghanistan & Pakistan 277. 81608.
5 North America 34895. 132595.
6 South Asia 512. 9396.
7 Sub-Saharan Africa 220. 13844.
8 <NA> Inf -Inf
- Look at the spread of GDP per capita across countries within the continents.
ggplot(
GDP_long %>% filter(region != "Aggregates"),
aes(x = gdp_per_capita, fill = region)
) +
geom_density(alpha = 0.5) +
scale_x_log10()+
scale_fill_viridis_d()Warning: Removed 150 rows containing non-finite outside the scale range
(`stat_density()`).
- How does life expectancy vary across different continents?
life_exp_data <- WDI(
country = "all",
indicator = c(life_expectancy = "SP.DYN.LE00.IN"),
start = 1998,
end = 2011,
extra = TRUE
) %>%
select(country, iso3c, year, region, life_expectancy)
life_exp_data %>% filter(region != "Aggregates") %>%
group_by(region) %>%
summarise (
mean_life_expect = mean(life_expectancy, na.rm = TRUE),
median_life_expect = median(life_expectancy, na.rm = TRUE),
min_life_expect = median(life_expectancy, na.rm = TRUE),
max_life_expect = max(life_expectancy, na.rm = TRUE)
)# A tibble: 7 × 5
region mean_life_expect median_life_expect min_life_expect max_life_expect
<chr> <dbl> <dbl> <dbl> <dbl>
1 East Asia… 70.6 70.4 70.4 83.4
2 Europe & … 75.3 76.1 76.1 85.0
3 Latin Ame… 72.3 72.9 72.9 79.6
4 Middle Ea… 71.6 72.8 72.8 81.9
5 North Ame… 78.8 78.8 78.8 81.5
6 South Asia 67.6 66.6 66.6 77.1
7 Sub-Sahar… 55.6 55.3 55.3 73.3
- Report the absolute and/or relative abundance of countries with low life expectancy over time by continent: Compute some measure of worldwide life expectancy - you decide - a mean or median, or some other quantile, or perhaps simply the life expectancy at your current age. Then determine how many countries on each continent have a life expectancy less than this benchmark, for each year.
#what is the story i am trying to tell? The country
life_exp_clean <- life_exp_data %>%
filter(region != "Aggregates")
benchmark <- mean(
life_exp_clean$life_expectancy,
na.rm = TRUE
)
benchmark[1] 69.00018
library(dplyr)
library(ggplot2)
life_exp_summary <- life_exp_clean %>%
filter(region != "Aggregates") %>%
mutate(below_benchmark = life_expectancy < benchmark) %>%
group_by(region, year) %>%
summarise(
countries_below_benchmark = sum(below_benchmark, na.rm = TRUE),
total_countries = n_distinct(country),
proportion_below_benchmark = countries_below_benchmark / total_countries,
.groups = "drop"
)
okabe_ito <- c("#000000", "#E69F00", "#56B4E9" ,"#009E73", "#F0E442", "#0072B2",
"#D55E00", "#CC79A7", "#999999")
ggplot(
life_exp_summary,
aes(x = year, y = countries_below_benchmark, color = region)
) +
geom_line(linewidth = 1.2) +
geom_point() +
scale_color_manual(values = okabe_ito) +
labs(
title = "Number of Countries Below Life Expectancy Benchmark Over Time",
x = "Year",
y = "Number of Countries Below Benchmark",
color = "Continent / Region"
) +
theme_minimal()- Make up your own!
1.2.1 Life Expectancy by Country, 1998–2011
1.2.1.1 Table
life_exp_wide <- life_exp_clean %>%
select(country, year, life_expectancy) %>%
mutate(life_expectancy = round(life_expectancy, 1)) %>%
pivot_wider(
names_from = year,
values_from = life_expectancy
) %>%
select(country, `1998`:`2011`) %>%
arrange(country)
life_exp_wide# A tibble: 217 × 15
country `1998` `1999` `2000` `2001` `2002` `2003` `2004` `2005` `2006` `2007`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Afghan… 52.5 54.5 55 55.5 56.2 57.2 57.8 58.2 58.6 59
2 Albania 74.4 74.6 74.8 75.1 75.3 75.6 76 76.4 77 77.7
3 Algeria 69.3 70.1 70.6 71 71.6 71.9 72.5 72.8 73.1 73.3
4 Americ… 71.5 71.5 71.4 71.6 71.8 71.7 72.3 72.4 72.4 72.4
5 Andorra 81.3 81.6 81.9 82.2 82.4 82.7 82.8 83.1 83.4 83.6
6 Angola 45.5 45.8 46.5 47 47.9 50.2 51.1 52.1 53 54.2
7 Antigu… 74.2 74.5 74.8 75.1 75.2 75.3 75.3 75.4 75.5 75.7
8 Argent… 73.4 73.7 73.9 74.2 74.3 74.3 74.9 75.2 75.3 74.8
9 Armenia 69.8 70.2 72.9 73 72.8 72.8 73 73.1 72.8 72.9
10 Aruba 72.9 72.9 72.9 73 73.1 73.2 73.2 73.4 73.5 73.6
# ℹ 207 more rows
# ℹ 4 more variables: `2008` <dbl>, `2009` <dbl>, `2010` <dbl>, `2011` <dbl>
1.2.1.2 Companion Graph
# I wanted to do something fun with this, so I went on a YouTube deep dive, and found this video to be very helpful : https://www.youtube.com/watch?v=qrW2y5sQFBg and this video as well https://www.youtube.com/watch?v=F0ZRYo4SUb8 and this one too: https://www.youtube.com/watch?v=RrtqBYLf404
install.packages(c("sf", "rnaturalearth", "rnaturalearthdata", "viridis"))Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)
install.packages("plotly")Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(dplyr)plot_ly(
data = life_exp_clean,
type = "choropleth",
locations = ~iso3c,
z = ~life_expectancy,
text = ~paste(
country,
"<br>Year:", year,
"<br>Life expectancy:", round(life_expectancy, 1)
),
frame = ~year,
colorscale = "Viridis",
colorbar = list(title = "Life expectancy")
) %>%
layout(
title = "Life Expectancy by Country Over Time",
geo = list(
projection = list(type = "natural earth"),
showframe = FALSE,
showcoastlines = TRUE
)
)# life_exp_clean %>%
# summarise(
# q1 = quantile(life_expectancy, 0.25, na.rm = TRUE),
# median = median(life_expectancy, na.rm = TRUE),
# q3 = quantile(life_expectancy, 0.75, na.rm = TRUE),
# IQR = IQR(life_expectancy, na.rm = TRUE)
# ) I wanted to use the Q1 as the lower cut off... but it flattened my graphic and lost a lot of the nuance that was previously present.
# ggplot(
# life_exp_clean,
# aes(x = life_expectancy)
# ) +
# geom_histogram(
# bins = 30,
# fill = "lightseagreen",
# color = "white"
# ) +
# labs(
# title = "Distribution of Life Expectancy Worldwide",
# x = "Life Expectancy",
# y = "Count"
# ) +
# theme_minimal() I found that creating a histogram was more useful in allowing me to see what i should use as the z min cutoff...
plot_ly(
data = life_exp_clean,
type = "choropleth",
locations = ~iso3c,
z = ~life_expectancy,
zmin = 50,
zmax = max(life_exp_clean$life_expectancy),
text = ~paste(
country,
"<br>Year:", year,
"<br>Life expectancy:", round(life_expectancy, 1)
),
frame = ~year,
colorscale = "Viridis",
colorbar = list(title = "Life expectancy")
) %>%
layout(
title = "Life Expectancy by Country Over Time",
geo = list(
projection = list(type = "natural earth"),
showframe = FALSE,
showcoastlines = TRUE
)
)1.2.1.3 Description/Writeup
I sat and looked at the prompt of “Report the absolute and/or relative abundance of countries with low life expectancy over time by continent” for a while, even reading it out loud to see if I can get a sense of what I’d like to see from a visual in order to answer the question. I realized that I’m a visual learner (like many folks here) and to me, it would be most helpful to have a map-element rather than bars or lines over time laying flatly on the axis. I went back and forth between the standardizing the z element which controls the life expectancy… because as you can see with the first iteration, 1998 was a difficult year for South Sudan (I did Google this, there was a humanitarian crisis, famine, and civil war here at that time). But the outlier being so significant caused my axes to change from year to year which interfered with continuity. I ran some additional stats and visuals to decide how to to standardize my axis, and encoded that into my z-variable. I also love the hover feature, and how you can add additional, site-specific information to the graph without compromising its overall readability.
1.2.2 Changes to Life Expectancies Over Time
1.2.2.1 Table
# life_exp_clean, plan: 1. add column to see biggest positive or negative changes to each country over time.
life_exp_changes <- life_exp_clean %>%
arrange(country, year) %>%
group_by(country) %>%
mutate(
life_exp_change = life_expectancy - lag(life_expectancy),
life_exp_change_label = ifelse(
is.na(life_exp_change),
NA,
sprintf("%+.2f", life_exp_change)
)
) %>%
ungroup()
life_exp_changes# A tibble: 3,038 × 7
country iso3c year region life_expectancy life_exp_change
<chr> <chr> <int> <chr> <dbl> <dbl>
1 Afghanistan AFG 1998 Middle East, North A… 52.5 NA
2 Afghanistan AFG 1999 Middle East, North A… 54.5 2.04
3 Afghanistan AFG 2000 Middle East, North A… 55.0 0.473
4 Afghanistan AFG 2001 Middle East, North A… 55.5 0.506
5 Afghanistan AFG 2002 Middle East, North A… 56.2 0.714
6 Afghanistan AFG 2003 Middle East, North A… 57.2 0.946
7 Afghanistan AFG 2004 Middle East, North A… 57.8 0.639
8 Afghanistan AFG 2005 Middle East, North A… 58.2 0.437
9 Afghanistan AFG 2006 Middle East, North A… 58.6 0.306
10 Afghanistan AFG 2007 Middle East, North A… 59.0 0.403
# ℹ 3,028 more rows
# ℹ 1 more variable: life_exp_change_label <chr>
life_exp_changes_wide <- life_exp_changes %>%
select(country, year, life_exp_change_label) %>%
pivot_wider(
names_from = year,
values_from = life_exp_change_label,
names_prefix = "change_"
) %>%
arrange(country)
life_exp_changes_wide # need to add "change total from 1998-2011" but also drop change_1998 bc that's n=0# A tibble: 217 × 15
country change_1998 change_1999 change_2000 change_2001 change_2002
<chr> <chr> <chr> <chr> <chr> <chr>
1 Afghanistan <NA> +2.04 +0.47 +0.51 +0.71
2 Albania <NA> +0.21 +0.26 +0.26 +0.22
3 Algeria <NA> +0.76 +0.49 +0.45 +0.62
4 American Samoa <NA> -0.06 -0.04 +0.13 +0.18
5 Andorra <NA> +0.27 +0.31 +0.30 +0.29
6 Angola <NA> +0.36 +0.69 +0.53 +0.84
7 Antigua and Barb… <NA> +0.27 +0.36 +0.29 +0.13
8 Argentina <NA> +0.27 +0.23 +0.24 +0.16
9 Armenia <NA> +0.40 +2.68 +0.10 -0.15
10 Aruba <NA> -0.08 +0.08 +0.11 +0.09
# ℹ 207 more rows
# ℹ 9 more variables: change_2003 <chr>, change_2004 <chr>, change_2005 <chr>,
# change_2006 <chr>, change_2007 <chr>, change_2008 <chr>, change_2009 <chr>,
# change_2010 <chr>, change_2011 <chr>
life_exp_changes_wide <- life_exp_changes_wide %>% left_join(life_exp_clean %>% group_by(country) %>% summarise(total_change_1998_2011 = sprintf("%+.2f", life_expectancy[year == 2011] - life_expectancy[year == 1998])), by = "country")
life_exp_changes_wide <- life_exp_changes_wide %>%
select(-change_1998)
life_exp_changes_wide_clean <- life_exp_changes_wide %>% clean_names() %>% rename(Country = country, `Total Change (1998–2011)` = total_change_1998_2011) %>% relocate(Country, `Total Change (1998–2011)`)
life_exp_changes_wide_clean# A tibble: 217 × 15
Country Total Change (1998–2…¹ change_1999 change_2000 change_2001
<chr> <chr> <chr> <chr> <chr>
1 Afghanistan +8.76 +2.04 +0.47 +0.51
2 Albania +3.95 +0.21 +0.26 +0.26
3 Algeria +5.06 +0.76 +0.49 +0.45
4 American Samoa +1.12 -0.06 -0.04 +0.13
5 Andorra +3.02 +0.27 +0.31 +0.30
6 Angola +12.64 +0.36 +0.69 +0.53
7 Antigua and Barbu… +2.84 +0.27 +0.36 +0.29
8 Argentina +2.69 +0.27 +0.23 +0.24
9 Armenia +4.02 +0.40 +2.68 +0.10
10 Aruba +1.64 -0.08 +0.08 +0.11
# ℹ 207 more rows
# ℹ abbreviated name: ¹`Total Change (1998–2011)`
# ℹ 10 more variables: change_2002 <chr>, change_2003 <chr>, change_2004 <chr>,
# change_2005 <chr>, change_2006 <chr>, change_2007 <chr>, change_2008 <chr>,
# change_2009 <chr>, change_2010 <chr>, change_2011 <chr>
1.2.2.2 Companion Graph
library(dplyr)
library(plotly)
life_exp_changes <- life_exp_clean %>%
arrange(country, year) %>%
group_by(country) %>%
mutate(
life_exp_change = life_expectancy - lag(life_expectancy)
) %>%
ungroup()
# Total change from 1998 → 2011
total_change <- life_exp_clean %>%
filter(year %in% c(1998, 2011)) %>%
select(country, iso3c, year, life_expectancy) %>%
pivot_wider(
names_from = year,
values_from = life_expectancy
) %>%
mutate(
life_exp_change = `2011` - `1998`,
year = "1998–2011 Total Change"
) %>%
select(country, iso3c, year, life_exp_change)
plot_data <- bind_rows(
life_exp_changes %>%
filter(!is.na(life_exp_change)) %>%
select(country, iso3c, year, life_exp_change) %>%
mutate(year = as.character(year)),
total_change %>%
mutate(year = as.character(year)))
plot_ly(
data = plot_data,
type = "choropleth",
locations = ~iso3c,
z = ~life_exp_change,
text = ~paste(
country,
"<br>Frame:", year,
"<br>Change:", sprintf("%+.2f", life_exp_change)
),
frame = ~year,
colorscale = "RdBu",
reversescale = TRUE,
colorbar = list(
title = "Change in years"
)
) %>%
layout(
title = "Changes in Life Expectancy Over Time",
geo = list(
projection = list(type = "natural earth"),
showframe = FALSE,
showcoastlines = TRUE
)
)# again, I ran into a similar problem of needing to standardize my scales, I think this works well:
plot_data <- bind_rows(
life_exp_changes %>%
filter(!is.na(life_exp_change)) %>%
select(country, iso3c, year, life_exp_change) %>%
mutate(year = as.character(year)))
plot_ly(
data = plot_data,
type = "choropleth",
locations = ~iso3c,
z = ~life_exp_change,
zmin = -5,
zmax = 5,
text = ~paste(
country,
"<br>Frame:", year,
"<br>Change:", sprintf("%+.2f", life_exp_change)
),
frame = ~year,
colorscale = list(
c(0.00, "#B2182B"), # negative = red
c(0.49, "#F4A582"), # negative side ends
c(0.50, "#D9D9D9"), # zero = gray
c(0.51, "#92C5DE"), # positive side starts blue
c(1.00, "#2166AC") # positive = blue
),
colorbar = list(
title = "Change in years"
)
) %>%
layout(
title = "Changes in Life Expectancy Over Time",
geo = list(
projection = list(type = "natural earth"),
showframe = FALSE,
showcoastlines = TRUE
)
)1.2.2.3 Description/Writeup
Here, I had the same issue of the “wandering color scale” which I first standardized and then I hard-coded the colors to be reflective of growth vs. decline. I think this helped me understand and visualize which countries are growing in life expectancy and which are seeing life expectancy decline. I think it would be cool to add in a “Total Growth” tab, but I’m not quite sure how it would be best to do that, or if it’s possible to add it to the timeline.