Goal:1 Business Scenario

library(gapminder)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
colnames(gapminder)
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
gapminder |> group_by(year, continent) |> summarise(count = n())
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 60 × 3
## # Groups:   year [12]
##     year continent count
##    <int> <fct>     <int>
##  1  1952 Africa       52
##  2  1952 Americas     25
##  3  1952 Asia         33
##  4  1952 Europe       30
##  5  1952 Oceania       2
##  6  1957 Africa       52
##  7  1957 Americas     25
##  8  1957 Asia         33
##  9  1957 Europe       30
## 10  1957 Oceania       2
## # ℹ 50 more rows
gapminder |> filter(continent == 'Asia')
## # A tibble: 396 × 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ℹ 386 more rows

Analysis

  • group by continents/countries and correlation between gdp-lifeexp

  • group by year continent country wise deviation of gdp and life exp

  • Bootstrapping for main top

  • Plot trends for different countries based on the above analysis and findings

gapminder |> filter(country == "Afghanistan") |> ggplot(mapping = aes(x = year, y = lifeExp)) + geom_col() + theme_classic()

#group by continents/countries and correlation between gdp-lifeexp

# correlation at the continent level
continent_correlation <- gapminder %>%
  group_by(continent) %>%
  summarize(correlation = cor(gdpPercap, lifeExp, use = "complete.obs"))

# correlation at the country level
country_correlation <- gapminder %>%
  group_by(country) %>%
  summarize(correlation = cor(gdpPercap, lifeExp, use = "complete.obs"))

print("Continent-Level Correlations:")
## [1] "Continent-Level Correlations:"
print(continent_correlation)
## # A tibble: 5 × 2
##   continent correlation
##   <fct>           <dbl>
## 1 Africa          0.426
## 2 Americas        0.558
## 3 Asia            0.382
## 4 Europe          0.781
## 5 Oceania         0.956
print("Country-Level Correlations:")
## [1] "Country-Level Correlations:"
print(country_correlation)
## # A tibble: 142 × 2
##    country     correlation
##    <fct>             <dbl>
##  1 Afghanistan     -0.0475
##  2 Albania          0.837 
##  3 Algeria          0.904 
##  4 Angola          -0.301 
##  5 Argentina        0.832 
##  6 Australia        0.986 
##  7 Austria          0.993 
##  8 Bahrain          0.898 
##  9 Bangladesh       0.847 
## 10 Belgium          0.993 
## # ℹ 132 more rows

#group by year continent country wise deviation of gdp and life exp

gapminder_deviation <- gapminder %>%
  group_by(year, continent) %>%
  mutate(
    gdp_deviation = gdpPercap - mean(gdpPercap, na.rm = TRUE),
    lifeExp_deviation = lifeExp - mean(lifeExp, na.rm = TRUE)
  )

head(gapminder_deviation)
## # A tibble: 6 × 8
## # Groups:   year, continent [6]
##   country     continent  year lifeExp      pop gdpPercap gdp_deviation
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>         <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.        -4416.
## 2 Afghanistan Asia       1957    30.3  9240934      821.        -4967.
## 3 Afghanistan Asia       1962    32.0 10267083      853.        -4876.
## 4 Afghanistan Asia       1967    34.0 11537966      836.        -5135.
## 5 Afghanistan Asia       1972    36.1 13079460      740.        -7447.
## 6 Afghanistan Asia       1977    38.4 14880372      786.        -7005.
## # ℹ 1 more variable: lifeExp_deviation <dbl>
bootstrap_correlation <- function(data, n_iterations = 1000) {
  correlations <- replicate(n_iterations, {
    sample_data <- data %>% sample_frac(replace = TRUE)
    cor(sample_data$gdpPercap, sample_data$lifeExp, use = "complete.obs")
  })
  quantile(correlations, c(0.025, 0.975))  # Return 95% confidence interval
}

# confidence intervals for each continent
continent_bootstrap <- gapminder %>%
  group_by(continent) %>%
  summarise(confidence_interval = list(bootstrap_correlation(ungroup(cur_data()))))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `confidence_interval =
##   list(bootstrap_correlation(ungroup(cur_data())))`.
## ℹ In group 1: `continent = Africa`.
## Caused by warning:
## ! `cur_data()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
continent_bootstrap <- continent_bootstrap %>%
  mutate(
    lower_bound = sapply(confidence_interval, function(x) x[1]),
    upper_bound = sapply(confidence_interval, function(x) x[2])
  ) %>%
  select(continent, lower_bound, upper_bound)

print(continent_bootstrap)
## # A tibble: 5 × 3
##   continent lower_bound upper_bound
##   <fct>           <dbl>       <dbl>
## 1 Africa          0.367       0.499
## 2 Americas        0.526       0.597
## 3 Asia            0.286       0.535
## 4 Europe          0.746       0.820
## 5 Oceania         0.931       0.982
selected_countries <- c("Afghanistan", "China", "USA", "Germany")
gapminder %>%
  filter(country %in% selected_countries) %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  theme_classic() +
  labs(
    title = "Life Expectancy Trends for Selected Countries",
    x = "Year",
    y = "Life Expectancy"
  )

gapminder |> filter(continent == "Africa") |> ggplot(mapping = aes(x = year, y = lifeExp)) + geom_col() + theme_classic()

gapminder |> filter(country  == 'USA') 
## # A tibble: 0 × 6
## # ℹ 6 variables: country <fct>, continent <fct>, year <int>, lifeExp <dbl>,
## #   pop <int>, gdpPercap <dbl>
  • Bootstrapping the top countries and get the information and its statistics based on it like p-value

Goal 3: Ethical and Epistemological Concerns

  • pop increases reduces impact of the funds

Overcoming biases (existing or potential).

  • some counties where added late and what could the possible bias

Possible risks or societal implications.

Crucial issues which might not be measurable.

Who would be affected by this project, and how does that affect your critique?