Assignment 1

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load Data

# Read the wide-format gapminder CSV from the working directory.
gapminder_wide <- read_csv(file = "gapminder_wide.csv")
Rows: 142 Columns: 26
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): country, continent
dbl (24): gdpPercap_1952, gdpPercap_1957, gdpPercap_1962, gdpPercap_1967, gd...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect column names, types, and the first few values of each column.
gapminder_wide %>% glimpse()
Rows: 142
Columns: 26
$ country        <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Argenti…
$ continent      <chr> "Asia", "Europe", "Africa", "Africa", "Americas", "Ocea…
$ gdpPercap_1952 <dbl> 779.4453, 1601.0561, 2449.0082, 3520.6103, 5911.3151, 1…
$ gdpPercap_1957 <dbl> 820.8530, 1942.2842, 3013.9760, 3827.9405, 6856.8562, 1…
$ gdpPercap_1962 <dbl> 853.1007, 2312.8890, 2550.8169, 4269.2767, 7133.1660, 1…
$ gdpPercap_1967 <dbl> 836.1971, 2760.1969, 3246.9918, 5522.7764, 8052.9530, 1…
$ gdpPercap_1972 <dbl> 739.9811, 3313.4222, 4182.6638, 5473.2880, 9443.0385, 1…
$ gdpPercap_1977 <dbl> 786.1134, 3533.0039, 4910.4168, 3008.6474, 10079.0267, …
$ gdpPercap_1982 <dbl> 978.0114, 3630.8807, 5745.1602, 2756.9537, 8997.8974, 1…
$ gdpPercap_1987 <dbl> 852.3959, 3738.9327, 5681.3585, 2430.2083, 9139.6714, 2…
$ gdpPercap_1992 <dbl> 649.3414, 2497.4379, 5023.2166, 2627.8457, 9308.4187, 2…
$ gdpPercap_1997 <dbl> 635.3414, 3193.0546, 4797.2951, 2277.1409, 10967.2820, …
$ gdpPercap_2002 <dbl> 726.7341, 4604.2117, 5288.0404, 2773.2873, 8797.6407, 3…
$ gdpPercap_2007 <dbl> 974.5803, 5937.0295, 6223.3675, 4797.2313, 12779.3796, …
$ lifeExp_1952   <dbl> 28.801, 55.230, 43.077, 30.015, 62.485, 69.120, 66.800,…
$ lifeExp_1957   <dbl> 30.33200, 59.28000, 45.68500, 31.99900, 64.39900, 70.33…
$ lifeExp_1962   <dbl> 31.99700, 64.82000, 48.30300, 34.00000, 65.14200, 70.93…
$ lifeExp_1967   <dbl> 34.02000, 66.22000, 51.40700, 35.98500, 65.63400, 71.10…
$ lifeExp_1972   <dbl> 36.08800, 67.69000, 54.51800, 37.92800, 67.06500, 71.93…
$ lifeExp_1977   <dbl> 38.43800, 68.93000, 58.01400, 39.48300, 68.48100, 73.49…
$ lifeExp_1982   <dbl> 39.854, 70.420, 61.368, 39.942, 69.942, 74.740, 73.180,…
$ lifeExp_1987   <dbl> 40.822, 72.000, 65.799, 39.906, 70.774, 76.320, 74.940,…
$ lifeExp_1992   <dbl> 41.674, 71.581, 67.744, 40.647, 71.868, 77.560, 76.040,…
$ lifeExp_1997   <dbl> 41.763, 72.950, 69.152, 40.963, 73.275, 78.830, 77.510,…
$ lifeExp_2002   <dbl> 42.129, 75.651, 70.994, 41.003, 74.340, 80.370, 78.980,…
$ lifeExp_2007   <dbl> 43.828, 76.423, 72.301, 42.731, 75.320, 81.235, 79.829,…

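Answer: This dataset contains country-level GDP per capita and life expectancy data across multiple years. Each of the 142 rows represents one country. Besides `country` and `continent`, each of the 24 remaining columns holds one variable for one year (e.g. `gdpPercap_1952`, `lifeExp_2007`), so the data are in wide format.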

Data Tidying

# Reshape from wide to long. Every measurement column is named
# "<variable>_<year>", so splitting on "_" yields the variable name
# (kept as its own column through the ".value" sentinel) and the year,
# which is converted from text to a number while pivoting.
gap_tidy <- pivot_longer(
  data = gapminder_wide,
  cols = !c(country, continent),
  names_to = c(".value", "year"),
  names_sep = "_",
  names_transform = list(year = as.numeric)
)

# Preview the first ten rows of the tidy table.
gap_tidy %>% head(n = 10)
# A tibble: 10 × 5
   country     continent  year gdpPercap lifeExp
   <chr>       <chr>     <dbl>     <dbl>   <dbl>
 1 Afghanistan Asia       1952      779.    28.8
 2 Afghanistan Asia       1957      821.    30.3
 3 Afghanistan Asia       1962      853.    32.0
 4 Afghanistan Asia       1967      836.    34.0
 5 Afghanistan Asia       1972      740.    36.1
 6 Afghanistan Asia       1977      786.    38.4
 7 Afghanistan Asia       1982      978.    39.9
 8 Afghanistan Asia       1987      852.    40.8
 9 Afghanistan Asia       1992      649.    41.7
10 Afghanistan Asia       1997      635.    41.8

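Answer: The `.value` sentinel in `names_to` tells `pivot_longer()` that the first part of each column name (`gdpPercap` or `lifeExp`) should become the name of an output column, while the second part (the year) is stored in the `year` column. As a result, GDP per capita and life expectancy become separate columns with one row per country-year, which is a tidy dataset.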

Filter Data

# Keep observations from 1970 onward for the six selected countries.
gap_filtered <- filter(
  gap_tidy,
  year >= 1970 &
    country %in% c("Turkey", "Brazil", "Korea, Rep.", "Germany",
                   "United States", "China")
)
print(gap_filtered)
# A tibble: 48 × 5
   country continent  year gdpPercap lifeExp
   <chr>   <chr>     <dbl>     <dbl>   <dbl>
 1 Brazil  Americas   1972     4986.    59.5
 2 Brazil  Americas   1977     6660.    61.5
 3 Brazil  Americas   1982     7031.    63.3
 4 Brazil  Americas   1987     7807.    65.2
 5 Brazil  Americas   1992     6950.    67.1
 6 Brazil  Americas   1997     7958.    69.4
 7 Brazil  Americas   2002     8131.    71.0
 8 Brazil  Americas   2007     9066.    72.4
 9 China   Asia       1972      677.    63.1
10 China   Asia       1977      741.    64.0
# ℹ 38 more rows

Continent Summary

# Mean GDP per capita and mean life expectancy per continent, pooled over
# all countries and years, ordered from highest to lowest average GDP.
continent_summary <- gap_tidy %>%
  summarize(
    avg_gdp = mean(gdpPercap, na.rm = TRUE),
    avg_lifeExp = mean(lifeExp, na.rm = TRUE),
    .by = continent
  ) %>%
  arrange(desc(avg_gdp))

print(continent_summary)
# A tibble: 5 × 3
  continent avg_gdp avg_lifeExp
  <chr>       <dbl>       <dbl>
1 Oceania    18622.        74.3
2 Europe     14469.        71.9
3 Asia        7902.        60.1
4 Americas    7136.        64.7
5 Africa      2194.        48.9

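Answer: Oceania and Europe have the highest average GDP per capita and life expectancy, reflecting higher development levels and better healthcare systems, while Africa has the lowest averages on both measures. Asia has a slightly higher average GDP per capita than the Americas but a lower average life expectancy.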

Top 5 GDP Countries

# Average GDP per capita per country over all years, keeping the five
# countries with the highest averages.
top5_gdp <- gap_tidy %>%
  summarize(avg_gdp = mean(gdpPercap, na.rm = TRUE), .by = country) %>%
  arrange(desc(avg_gdp)) %>%
  head(n = 5)

print(top5_gdp)
# A tibble: 5 × 2
  country       avg_gdp
  <chr>           <dbl>
1 Kuwait         65333.
2 Switzerland    27074.
3 Norway         26747.
4 United States  26261.
5 Canada         22411.

Answer: Wealthy countries, including small ones such as Kuwait, Switzerland, and Norway, appear at the top because GDP per capita measures income per person rather than total economic size.

Correlation Analysis

# Correlation between GDP per capita and life expectancy within each
# continent, ordered from strongest to weakest.
continent_correlations <- gap_tidy %>%
  summarize(
    correlation = cor(x = gdpPercap, y = lifeExp, use = "complete.obs"),
    .by = continent
  ) %>%
  arrange(desc(correlation))

print(continent_correlations)
# A tibble: 5 × 2
  continent correlation
  <chr>           <dbl>
1 Oceania         0.956
2 Europe          0.781
3 Americas        0.558
4 Africa          0.426
5 Asia            0.382

Answer: There is a positive relationship between GDP per capita and life expectancy across all continents, but the strength of the relationship varies.

Data Join

# Read the life-expectancy table (country, year, lifeExp).
gap_life <- read_csv(file = "gap_life.csv")
Rows: 1618 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): country
dbl (2): year, lifeExp

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the GDP-per-capita table (country, year, gdpPercap).
gap_gdp <- read_csv(file = "gap_gdp.csv")
Rows: 1618 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): country
dbl (2): year, gdpPercap

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the structure of the life-expectancy table.
gap_life %>% glimpse()
Rows: 1,618
Columns: 3
$ country <chr> "Mali", "Malaysia", "Zambia", "Greece", "Swaziland", "Iran", "…
$ year    <dbl> 1992, 1967, 1987, 2002, 1967, 1997, 2007, 2007, 1957, 2002, 19…
$ lifeExp <dbl> 48.388, 59.371, 50.821, 78.256, 46.633, 68.042, 73.747, 78.098…
# Inspect the structure of the GDP-per-capita table.
gap_gdp %>% glimpse()
Rows: 1,618
Columns: 3
$ country   <chr> "Bangladesh", "Mongolia", "Taiwan", "Burkina Faso", "Angola"…
$ year      <dbl> 1987, 1997, 2002, 1962, 1962, 1977, 2007, 1962, 1992, 1972, …
$ gdpPercap <dbl> 751.9794, 1902.2521, 23235.4233, 722.5120, 4269.2767, 2785.4…
# Combine the two tables, keeping only country-year pairs present in both.
gap_joined <- gap_life %>%
  inner_join(gap_gdp, by = join_by(country, year))

print(gap_joined)
# A tibble: 1,535 × 4
   country    year lifeExp gdpPercap
   <chr>     <dbl>   <dbl>     <dbl>
 1 Mali       1992    48.4      739.
 2 Malaysia   1967    59.4     2278.
 3 Zambia     1987    50.8     1213.
 4 Greece     2002    78.3    22514.
 5 Swaziland  1967    46.6     2613.
 6 Iran       1997    68.0     8264.
 7 Venezuela  2007    73.7    11416.
 8 Portugal   2007    78.1    20510.
 9 Sweden     1957    72.5     9912.
10 Brazil     2002    71.0     8131.
# ℹ 1,525 more rows
# Number of country-year rows that survived the inner join.
gap_joined %>% nrow()
[1] 1535
# Number of different countries represented in the joined table.
length(unique(gap_joined$country))
[1] 142

Answer: The joined dataset has fewer rows (1,535, versus 1,618 in each input) because `inner_join()` keeps only the country-year combinations that exist in both datasets. All 142 countries are still represented.

Missing Values

# List any rows where life expectancy or GDP per capita is missing.
gap_joined %>%
  filter(!complete.cases(lifeExp, gdpPercap))
# A tibble: 0 × 4
# ℹ 4 variables: country <chr>, year <dbl>, lifeExp <dbl>, gdpPercap <dbl>

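Answer: The joined dataset contains no missing values in `lifeExp` or `gdpPercap` (the filter returns zero rows). In general, missing values can be removed or imputed, but both approaches may introduce bias or reduce accuracy.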

Economic Interpretation

Answer: GDP per capita and life expectancy generally increase together across countries. Oceania and Europe show the highest averages, while Africa has the lowest. There is a positive relationship between income and health outcomes on every continent, but its strength varies widely (correlations range from about 0.38 in Asia to 0.96 in Oceania), so it is not perfect, likely due to inequality and structural differences. Limitations include country-year observations lost when joining the two datasets and the use of only two variables, which does not capture education or healthcare quality. Despite this, the dataset clearly shows global development patterns.