lab04

Author

Anais

Published

May 4, 2026

#install.packages("Hmisc")
#install.packages("ggbeeswarm")
#install.packages("skimr")
#install.packages("janitor")
#install.packages("ggdist")

1 Lab 4 Deliverable

library(readxl)
GDP_data <- read_excel("data/gapminder_broadband_per_100.xlsx")
head(GDP_data)

# A tibble: 6 × 15
  Fixed broadband Internet…¹ `1998` `1999` `2000` `2001` `2002`  `2003`   `2004`
  <chr>                       <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>    <dbl>
1 Afghanistan                    NA     NA     NA      0   0     0       6.88e-4
2 Albania                        NA     NA     NA      0   0     0       0      
3 Algeria                        NA     NA     NA      0   0     0.0564  1.11e-1
4 American Samoa                 NA     NA     NA      0   0    NA      NA      
5 Andorra                        NA     NA     NA     NA   1.66  4.99    8.34e+0
6 Angola                         NA     NA     NA      0   0     0       0      
# ℹ abbreviated name: ¹`Fixed broadband Internet subscribers (per 100 people)`
# ℹ 7 more variables: `2005` <dbl>, `2006` <dbl>, `2007` <dbl>, `2008` <dbl>,
#   `2009` <dbl>, `2010` <dbl>, `2011` <lgl>

install.packages("WDI")

Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)

install.packages("countrycode")

Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)

1.1 Part 1: Tidying

library(readxl)
library(WDI)
library(countrycode)
library(dplyr)
library(tidyr)
library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

GDP_data <- read_excel("data/gapminder_broadband_per_100.xlsx")

glimpse(GDP_data)

Rows: 213
Columns: 15
$ `Fixed broadband Internet subscribers (per 100 people)` <chr> "Afghanistan",…
$ `1998`                                                  <dbl> NA, NA, NA, NA…
$ `1999`                                                  <dbl> NA, NA, NA, NA…
$ `2000`                                                  <dbl> NA, NA, NA, NA…
$ `2001`                                                  <dbl> 0.0000000000, …
$ `2002`                                                  <dbl> 0.0000000000, …
$ `2003`                                                  <dbl> 0.000000e+00, …
$ `2004`                                                  <dbl> 6.880265e-04, …
$ `2005`                                                  <dbl> 7.356639e-04, …
$ `2006`                                                  <dbl> 0.001625928, N…
$ `2007`                                                  <dbl> 0.001581161, 0…
$ `2008`                                                  <dbl> 0.001537626, 2…
$ `2009`                                                  <dbl> 0.00299058, 2.…
$ `2010`                                                  <dbl> 0.004362367, 3…
$ `2011`                                                  <lgl> NA, NA, NA, NA…

gdp_data_no_really_it_is <- WDI(
  country = "all",
  indicator = c(gdp_per_capita = "NY.GDP.PCAP.KD"),
  start = 1998,
  end = 2011,
  extra = TRUE
) %>%
  select(iso3c, year, gdp_per_capita, region)

head(gdp_data_no_really_it_is)

  iso3c year gdp_per_capita                                            region
1   AFG 2011       525.4270 Middle East, North Africa, Afghanistan & Pakistan
2   AFG 2010       542.8710 Middle East, North Africa, Afghanistan & Pakistan
3   AFG 2009       488.8307 Middle East, North Africa, Afghanistan & Pakistan
4   AFG 2008       417.6473 Middle East, North Africa, Afghanistan & Pakistan
5   AFG 2007       410.7577 Middle East, North Africa, Afghanistan & Pakistan
6   AFG 2006       367.7583 Middle East, North Africa, Afghanistan & Pakistan

broadband_long <- GDP_data %>%
  clean_names() %>%
  rename(country = fixed_broadband_internet_subscribers_per_100_people) %>%
  pivot_longer(
    cols = matches("^x?\\d{4}$"),
    names_to = "year",
    values_to = "broadband_per_100"
  ) %>%
  mutate(
    year = as.numeric(gsub("^x", "", year)),
    broadband_per_100 = as.numeric(broadband_per_100),
    iso3c = countrycode(
      country,
      "country.name",
      "iso3c",
      custom_match = c(
        "Channel Islands" = "CHI",
        "Kosovo" = "XKX",
        "Saint Martin" = "MAF"
      )
    )
  )
# change iso3c = 3-letter country code, to match the names of the country as they are listed in the broadband_internet


head(broadband_long)

# A tibble: 6 × 4
  country      year broadband_per_100 iso3c
  <chr>       <dbl>             <dbl> <chr>
1 Afghanistan  1998                NA AFG  
2 Afghanistan  1999                NA AFG  
3 Afghanistan  2000                NA AFG  
4 Afghanistan  2001                 0 AFG  
5 Afghanistan  2002                 0 AFG  
6 Afghanistan  2003                 0 AFG

GDP_long <- broadband_long %>%
  left_join(
    gdp_data_no_really_it_is,
    by = c("iso3c", "year")
  )

head(GDP_long)

# A tibble: 6 × 6
  country      year broadband_per_100 iso3c gdp_per_capita region               
  <chr>       <dbl>             <dbl> <chr>          <dbl> <chr>                
1 Afghanistan  1998                NA AFG              NA  Middle East, North A…
2 Afghanistan  1999                NA AFG              NA  Middle East, North A…
3 Afghanistan  2000                NA AFG             308. Middle East, North A…
4 Afghanistan  2001                 0 AFG             277. Middle East, North A…
5 Afghanistan  2002                 0 AFG             338. Middle East, North A…
6 Afghanistan  2003                 0 AFG             346. Middle East, North A…

1.2 Part 2: Gapminder

Recall the Task Menu:

Get the maximum and minimum of GDP per capita for all continents.

GDP_long %>%
  group_by(region) %>%
  summarise(
    min_gdp = min(gdp_per_capita, na.rm = TRUE),
    max_gdp = max(gdp_per_capita, na.rm = TRUE)
  )

Warning: There were 2 warnings in `summarise()`.
The first warning was:
ℹ In argument: `min_gdp = min(gdp_per_capita, na.rm = TRUE)`.
ℹ In group 8: `region = NA`.
Caused by warning in `min()`:
! no non-missing arguments to min; returning Inf
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

# A tibble: 8 × 3
  region                                            min_gdp max_gdp
  <chr>                                               <dbl>   <dbl>
1 East Asia & Pacific                                  242.  88169.
2 Europe & Central Asia                                389. 163244.
3 Latin America & Caribbean                           1287.  96203.
4 Middle East, North Africa, Afghanistan & Pakistan    277.  81608.
5 North America                                      34895. 132595.
6 South Asia                                           512.   9396.
7 Sub-Saharan Africa                                   220.  13844.
8 <NA>                                                 Inf    -Inf

Look at the spread of GDP per capita across countries within the continents.

ggplot(
  GDP_long %>% filter(region != "Aggregates"),
  aes(x = gdp_per_capita, fill = region)
) + 
  geom_density(alpha = 0.5) +
  scale_x_log10()+
  scale_fill_viridis_d()

Warning: Removed 150 rows containing non-finite outside the scale range
(`stat_density()`).

How does life expectancy vary across different continents?

life_exp_data <- WDI(
  country = "all",
  indicator = c(life_expectancy = "SP.DYN.LE00.IN"),
  start = 1998,
  end = 2011, 
  extra = TRUE
) %>% 
  select(country, iso3c, year, region, life_expectancy)

life_exp_data %>% filter(region != "Aggregates") %>% 
  group_by(region) %>% 
  summarise (
    mean_life_expect = mean(life_expectancy, na.rm = TRUE), 
             median_life_expect = median(life_expectancy, na.rm = TRUE), 
    min_life_expect = median(life_expectancy, na.rm = TRUE), 
    max_life_expect = max(life_expectancy, na.rm = TRUE)
    
    )

# A tibble: 7 × 5
  region     mean_life_expect median_life_expect min_life_expect max_life_expect
  <chr>                 <dbl>              <dbl>           <dbl>           <dbl>
1 East Asia…             70.6               70.4            70.4            83.4
2 Europe & …             75.3               76.1            76.1            85.0
3 Latin Ame…             72.3               72.9            72.9            79.6
4 Middle Ea…             71.6               72.8            72.8            81.9
5 North Ame…             78.8               78.8            78.8            81.5
6 South Asia             67.6               66.6            66.6            77.1
7 Sub-Sahar…             55.6               55.3            55.3            73.3

Report the absolute and/or relative abundance of countries with low life expectancy over time by continent: Compute some measure of worldwide life expectancy - you decide - a mean or median, or some other quantile, or perhaps simply the life expectancy at your current age. Then determine how many countries on each continent have a life expectancy less than this benchmark, for each year.

#what is the story i am trying to tell? The country
life_exp_clean <- life_exp_data %>%
  filter(region != "Aggregates")

benchmark <- mean(
  life_exp_clean$life_expectancy,
  na.rm = TRUE
)

benchmark

[1] 69.00018

library(dplyr)
library(ggplot2)

life_exp_summary <- life_exp_clean %>%
  filter(region != "Aggregates") %>%
  mutate(below_benchmark = life_expectancy < benchmark) %>%
  group_by(region, year) %>%
  summarise(
    countries_below_benchmark = sum(below_benchmark, na.rm = TRUE),
    total_countries = n_distinct(country),
    proportion_below_benchmark = countries_below_benchmark / total_countries,
    .groups = "drop"
  )
okabe_ito <- c("#000000", "#E69F00", "#56B4E9" ,"#009E73", "#F0E442", "#0072B2",
 "#D55E00", "#CC79A7", "#999999")

ggplot(
  life_exp_summary,
  aes(x = year, y = countries_below_benchmark, color = region)
) +
  geom_line(linewidth = 1.2) +
  geom_point() +
  scale_color_manual(values = okabe_ito) + 
  labs(
    title = "Number of Countries Below Life Expectancy Benchmark Over Time",
    x = "Year",
    y = "Number of Countries Below Benchmark",
    color = "Continent / Region"
  ) +
  theme_minimal()

Make up your own!

1.2.1 Life Expectancy by Country, 1998–2011

1.2.1.1 Table

life_exp_wide <- life_exp_clean %>%
  select(country, year, life_expectancy) %>%
  mutate(life_expectancy = round(life_expectancy, 1)) %>%
  pivot_wider(
    names_from = year,
    values_from = life_expectancy
  ) %>%
  select(country, `1998`:`2011`) %>%
  arrange(country)

life_exp_wide

# A tibble: 217 × 15
   country `1998` `1999` `2000` `2001` `2002` `2003` `2004` `2005` `2006` `2007`
   <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 Afghan…   52.5   54.5   55     55.5   56.2   57.2   57.8   58.2   58.6   59  
 2 Albania   74.4   74.6   74.8   75.1   75.3   75.6   76     76.4   77     77.7
 3 Algeria   69.3   70.1   70.6   71     71.6   71.9   72.5   72.8   73.1   73.3
 4 Americ…   71.5   71.5   71.4   71.6   71.8   71.7   72.3   72.4   72.4   72.4
 5 Andorra   81.3   81.6   81.9   82.2   82.4   82.7   82.8   83.1   83.4   83.6
 6 Angola    45.5   45.8   46.5   47     47.9   50.2   51.1   52.1   53     54.2
 7 Antigu…   74.2   74.5   74.8   75.1   75.2   75.3   75.3   75.4   75.5   75.7
 8 Argent…   73.4   73.7   73.9   74.2   74.3   74.3   74.9   75.2   75.3   74.8
 9 Armenia   69.8   70.2   72.9   73     72.8   72.8   73     73.1   72.8   72.9
10 Aruba     72.9   72.9   72.9   73     73.1   73.2   73.2   73.4   73.5   73.6
# ℹ 207 more rows
# ℹ 4 more variables: `2008` <dbl>, `2009` <dbl>, `2010` <dbl>, `2011` <dbl>

1.2.1.2 Companion Graph

# I wanted to do something fun with this, so I went on a YouTube deep dive, and found this video to be very helpful : https://www.youtube.com/watch?v=qrW2y5sQFBg and this video as well https://www.youtube.com/watch?v=F0ZRYo4SUb8 and this one too: https://www.youtube.com/watch?v=RrtqBYLf404

install.packages(c("sf", "rnaturalearth", "rnaturalearthdata", "viridis"))

Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)

install.packages("plotly")

Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)

library(plotly)


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

library(dplyr)

plot_ly(
  data = life_exp_clean,
  type = "choropleth",
  locations = ~iso3c,
  z = ~life_expectancy,
  text = ~paste(
    country,
    "<br>Year:", year,
    "<br>Life expectancy:", round(life_expectancy, 1)
  ),
  frame = ~year,
  colorscale = "Viridis",
  colorbar = list(title = "Life expectancy")
) %>%
  layout(
    title = "Life Expectancy by Country Over Time",
    geo = list(
      projection = list(type = "natural earth"),
      showframe = FALSE,
      showcoastlines = TRUE
    )
  )

# life_exp_clean %>%
#   summarise(
#     q1 = quantile(life_expectancy, 0.25, na.rm = TRUE),
#     median = median(life_expectancy, na.rm = TRUE),
#     q3 = quantile(life_expectancy, 0.75, na.rm = TRUE),
#     IQR = IQR(life_expectancy, na.rm = TRUE)
#   )  I wanted to use the Q1 as the lower cut off... but it flattened my graphic and lost a lot of the nuance that was previously present. 

# ggplot(
#   life_exp_clean,
#   aes(x = life_expectancy)
# ) +
#   geom_histogram(
#     bins = 30,
#     fill = "lightseagreen",
#     color = "white"
#   ) +
#   labs(
#     title = "Distribution of Life Expectancy Worldwide",
#     x = "Life Expectancy",
#     y = "Count"
#   ) +
#   theme_minimal() I found that creating a histogram was more useful in allowing me to see what i should use as the z min cutoff... 


plot_ly(
  data = life_exp_clean,
  type = "choropleth",
  locations = ~iso3c,
  z = ~life_expectancy,
  zmin = 50, 
  zmax = max(life_exp_clean$life_expectancy),
  text = ~paste(
    country,
    "<br>Year:", year,
    "<br>Life expectancy:", round(life_expectancy, 1)
  ),
  frame = ~year,
  colorscale = "Viridis",
  colorbar = list(title = "Life expectancy")
) %>%
  layout(
    title = "Life Expectancy by Country Over Time",
    geo = list(
      projection = list(type = "natural earth"),
      showframe = FALSE,
      showcoastlines = TRUE
    )
  )

1.2.1.3 Description/Writeup

I sat and looked at the prompt of “Report the absolute and/or relative abundance of countries with low life expectancy over time by continent” for a while, even reading it out loud to see if I can get a sense of what I’d like to see from a visual in order to answer the question. I realized that I’m a visual learner (like many folks here) and to me, it would be most helpful to have a map-element rather than bars or lines over time laying flatly on the axis. I went back and forth between the standardizing the z element which controls the life expectancy… because as you can see with the first iteration, 1998 was a difficult year for South Sudan (I did Google this, there was a humanitarian crisis, famine, and civil war here at that time). But the outlier being so significant caused my axes to change from year to year which interfered with continuity. I ran some additional stats and visuals to decide how to to standardize my axis, and encoded that into my z-variable. I also love the hover feature, and how you can add additional, site-specific information to the graph without compromising its overall readability.

1.2.2 Changes to Life Expectancies Over Time

1.2.2.1 Table

# life_exp_clean, plan: 1. add column to see biggest positive or negative changes to each country over time.

life_exp_changes <- life_exp_clean %>%
  arrange(country, year) %>%
  group_by(country) %>%
  mutate(
    life_exp_change = life_expectancy - lag(life_expectancy),

    life_exp_change_label = ifelse(
      is.na(life_exp_change),
      NA,
      sprintf("%+.2f", life_exp_change)
    )
  ) %>%
  ungroup()

life_exp_changes

# A tibble: 3,038 × 7
   country     iso3c  year region                life_expectancy life_exp_change
   <chr>       <chr> <int> <chr>                           <dbl>           <dbl>
 1 Afghanistan AFG    1998 Middle East, North A…            52.5          NA    
 2 Afghanistan AFG    1999 Middle East, North A…            54.5           2.04 
 3 Afghanistan AFG    2000 Middle East, North A…            55.0           0.473
 4 Afghanistan AFG    2001 Middle East, North A…            55.5           0.506
 5 Afghanistan AFG    2002 Middle East, North A…            56.2           0.714
 6 Afghanistan AFG    2003 Middle East, North A…            57.2           0.946
 7 Afghanistan AFG    2004 Middle East, North A…            57.8           0.639
 8 Afghanistan AFG    2005 Middle East, North A…            58.2           0.437
 9 Afghanistan AFG    2006 Middle East, North A…            58.6           0.306
10 Afghanistan AFG    2007 Middle East, North A…            59.0           0.403
# ℹ 3,028 more rows
# ℹ 1 more variable: life_exp_change_label <chr>

life_exp_changes_wide <- life_exp_changes %>%
  select(country, year, life_exp_change_label) %>%
  pivot_wider(
    names_from = year,
    values_from = life_exp_change_label,
    names_prefix = "change_"
  ) %>%
  arrange(country)

life_exp_changes_wide # need to add "change total from 1998-2011" but also drop change_1998 bc that's n=0

# A tibble: 217 × 15
   country           change_1998 change_1999 change_2000 change_2001 change_2002
   <chr>             <chr>       <chr>       <chr>       <chr>       <chr>      
 1 Afghanistan       <NA>        +2.04       +0.47       +0.51       +0.71      
 2 Albania           <NA>        +0.21       +0.26       +0.26       +0.22      
 3 Algeria           <NA>        +0.76       +0.49       +0.45       +0.62      
 4 American Samoa    <NA>        -0.06       -0.04       +0.13       +0.18      
 5 Andorra           <NA>        +0.27       +0.31       +0.30       +0.29      
 6 Angola            <NA>        +0.36       +0.69       +0.53       +0.84      
 7 Antigua and Barb… <NA>        +0.27       +0.36       +0.29       +0.13      
 8 Argentina         <NA>        +0.27       +0.23       +0.24       +0.16      
 9 Armenia           <NA>        +0.40       +2.68       +0.10       -0.15      
10 Aruba             <NA>        -0.08       +0.08       +0.11       +0.09      
# ℹ 207 more rows
# ℹ 9 more variables: change_2003 <chr>, change_2004 <chr>, change_2005 <chr>,
#   change_2006 <chr>, change_2007 <chr>, change_2008 <chr>, change_2009 <chr>,
#   change_2010 <chr>, change_2011 <chr>

life_exp_changes_wide <- life_exp_changes_wide %>% left_join(life_exp_clean %>% group_by(country) %>% summarise(total_change_1998_2011 = sprintf("%+.2f", life_expectancy[year == 2011] - life_expectancy[year == 1998])), by = "country")

life_exp_changes_wide <- life_exp_changes_wide %>%
  select(-change_1998)

life_exp_changes_wide_clean <- life_exp_changes_wide %>% clean_names() %>% rename(Country = country, `Total Change (1998–2011)` = total_change_1998_2011) %>% relocate(Country, `Total Change (1998–2011)`)

life_exp_changes_wide_clean

# A tibble: 217 × 15
   Country            Total Change (1998–2…¹ change_1999 change_2000 change_2001
   <chr>              <chr>                  <chr>       <chr>       <chr>      
 1 Afghanistan        +8.76                  +2.04       +0.47       +0.51      
 2 Albania            +3.95                  +0.21       +0.26       +0.26      
 3 Algeria            +5.06                  +0.76       +0.49       +0.45      
 4 American Samoa     +1.12                  -0.06       -0.04       +0.13      
 5 Andorra            +3.02                  +0.27       +0.31       +0.30      
 6 Angola             +12.64                 +0.36       +0.69       +0.53      
 7 Antigua and Barbu… +2.84                  +0.27       +0.36       +0.29      
 8 Argentina          +2.69                  +0.27       +0.23       +0.24      
 9 Armenia            +4.02                  +0.40       +2.68       +0.10      
10 Aruba              +1.64                  -0.08       +0.08       +0.11      
# ℹ 207 more rows
# ℹ abbreviated name: ¹`Total Change (1998–2011)`
# ℹ 10 more variables: change_2002 <chr>, change_2003 <chr>, change_2004 <chr>,
#   change_2005 <chr>, change_2006 <chr>, change_2007 <chr>, change_2008 <chr>,
#   change_2009 <chr>, change_2010 <chr>, change_2011 <chr>

1.2.2.2 Companion Graph

library(dplyr)
library(plotly)

life_exp_changes <- life_exp_clean %>%
  arrange(country, year) %>%
  group_by(country) %>%
  mutate(
    life_exp_change = life_expectancy - lag(life_expectancy)
  ) %>%
  ungroup()

# Total change from 1998 → 2011

total_change <- life_exp_clean %>%
  filter(year %in% c(1998, 2011)) %>%
  select(country, iso3c, year, life_expectancy) %>%
  
  pivot_wider(
    names_from = year,
    values_from = life_expectancy
  ) %>%
  
  mutate(
    life_exp_change = `2011` - `1998`,
    year = "1998–2011 Total Change"
  ) %>%
  
  select(country, iso3c, year, life_exp_change)


plot_data <- bind_rows(
  life_exp_changes %>%
    filter(!is.na(life_exp_change)) %>%
    select(country, iso3c, year, life_exp_change) %>%
    mutate(year = as.character(year)),
  
  total_change %>%
    mutate(year = as.character(year)))


plot_ly(
  data = plot_data,
  type = "choropleth",
  locations = ~iso3c,
  z = ~life_exp_change,
  text = ~paste(
    country,
    "<br>Frame:", year,
    "<br>Change:", sprintf("%+.2f", life_exp_change)
  ),
  
  frame = ~year,
  
  colorscale = "RdBu",
  reversescale = TRUE,
  
  colorbar = list(
    title = "Change in years"
  )
) %>%
  layout(
    title = "Changes in Life Expectancy Over Time",
    
    geo = list(
      projection = list(type = "natural earth"),
      showframe = FALSE,
      showcoastlines = TRUE
    )
  )

# again, I ran into a similar problem of needing to standardize my scales, I think this works well: 
 plot_data <- bind_rows(
  life_exp_changes %>%
    filter(!is.na(life_exp_change)) %>%
    select(country, iso3c, year, life_exp_change) %>%
    mutate(year = as.character(year)))


plot_ly(
  data = plot_data,
  type = "choropleth",
  locations = ~iso3c,
  z = ~life_exp_change,
  zmin = -5,
  zmax = 5,
  text = ~paste(
    country,
    "<br>Frame:", year,
    "<br>Change:", sprintf("%+.2f", life_exp_change)
  ),
  
  frame = ~year,
  
  colorscale = list(
  c(0.00, "#B2182B"),  # negative = red
  c(0.49, "#F4A582"), # negative side ends
  c(0.50, "#D9D9D9"), # zero = gray
  c(0.51, "#92C5DE"), # positive side starts blue
  c(1.00, "#2166AC")   # positive = blue
),
  colorbar = list(
    title = "Change in years"
  )
) %>%
  layout(
    title = "Changes in Life Expectancy Over Time",
    
    geo = list(
      projection = list(type = "natural earth"),
      showframe = FALSE,
      showcoastlines = TRUE
    )
  )

1.2.2.3 Description/Writeup

Here, I had the same issue of the “wandering color scale” which I first standardized and then I hard-coded the colors to be reflective of growth vs. decline. I think this helped me understand and visualize which countries are growing in life expectancy and which are seeing life expectancy decline. I think it would be cool to add in a “Total Growth” tab, but I’m not quite sure how it would be best to do that, or if it’s possible to add it to the timeline.