AI in Public Health

Course Code: COMH7306A

Author
Affiliation

Vusumuzi Mabasa

University of the Witwatersrand (School of Public Health)

Published

September 2, 2025

Used the gapminder dataset (country, continent, year, lifeExp, pop, gdpPercap).

library(gapminder)
library(dplyr)
library(ggplot2)

# Load the packaged 'gapminder' tibble into the workspace, then print a
# compact overview of its columns.
data("gapminder", package = "gapminder")
glimpse(gapminder)
#> Rows: 1,704
#> Columns: 6
#> $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
#> $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
#> $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
#> $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
#> $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
#> $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Selected relevant variables, filtered to South Africa & Ireland for comparison. Then a t-test was conducted to compare mean life expectancy (lifeExp) between those two countries using t.test(lifeExp ~ country, data = df1).

library(dplyr)

# Keep only the rows for the two countries being compared.
df1 <- filter(gapminder, country %in% c("South Africa", "Ireland"))

# Two-sample t-test of life expectancy by country. With the formula interface
# and no var.equal argument, t.test() runs the Welch (unequal-variance) test.
# NOTE(review): each country's observations are repeated measurements over
# time, which the test treats as independent samples -- confirm this is
# acceptable for the comparison being made.
t_test_result <- t.test(lifeExp ~ country, data = df1)
t_test_result
#> 
#>  Welch Two Sample t-test
#> 
#> data:  lifeExp by country
#> t = 10.067, df = 19.109, p-value = 4.466e-09
#> alternative hypothesis: true difference in means between group Ireland and group South Africa is not equal to 0
#> 95 percent confidence interval:
#>  15.07022 22.97794
#> sample estimates:
#>      mean in group Ireland mean in group South Africa 
#>                   73.01725                   53.99317

Visualization with ggplot2: Boxplot/scatterplot to show differences

# Compare the life-expectancy distributions of the two countries in df1:
# one box per country, with the individual observations overlaid as points.
ggplot(df1, aes(x = country, y = lifeExp, fill = country)) +
  # Hide the boxplot's own outlier markers: the jitter layer below already
  # draws every observation, so outliers would otherwise be plotted twice.
  geom_boxplot(alpha = 0.6, outlier.shape = NA) +
  # Jitter horizontally only (height = 0) so every point stays at its exact
  # life-expectancy value; the default also adds vertical noise.
  geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
  labs(title = "Life Expectancy: South Africa vs Ireland",
       y = "Life Expectancy (years)", x = "") +
  theme_minimal() +
  # The fill colour repeats what the x axis already shows, so drop the legend.
  theme(legend.position = "none")

Linear regression: Modeled life expectancy (lifeExp) as a function of GDP per capita (gdpPercap), first simple, then extended.

# Restrict the data to observations with GDP per capita below 50,000 so a few
# extreme values do not dominate the fits (the recorded output shows 1,698 of
# the 1,704 rows remain).
gm_filtered <- filter(gapminder, gdpPercap < 50000)

# Model 1: life expectancy as a straight-line function of raw GDP per capita.
model1 <- lm(lifeExp ~ gdpPercap, data = gm_filtered)
summary(model1)
#> 
#> Call:
#> lm(formula = lifeExp ~ gdpPercap, data = gm_filtered)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -29.8604  -7.3390   0.8608   7.7148  19.5502 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 5.209e+01  2.969e-01  175.44   <2e-16 ***
#> gdpPercap   1.067e-03  2.747e-05   38.85   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 9.412 on 1696 degrees of freedom
#> Multiple R-squared:  0.4708, Adjusted R-squared:  0.4705 
#> F-statistic:  1509 on 1 and 1696 DF,  p-value: < 2.2e-16
# Model 2: same outcome, but GDP per capita enters on a log10 scale, so the
# slope is the change in life expectancy per tenfold increase in GDP per capita.
model2 <- lm(lifeExp ~ log10(gdpPercap), data = gm_filtered)
summary(model2)
#> 
#> Call:
#> lm(formula = lifeExp ~ log10(gdpPercap), data = gm_filtered)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -29.6130  -4.0369   0.9468   4.5145  19.6084 
#> 
#> Coefficients:
#>                  Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)      -10.5665     1.2184  -8.672   <2e-16 ***
#> log10(gdpPercap)  19.7920     0.3405  58.125   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 7.48 on 1696 degrees of freedom
#> Multiple R-squared:  0.6658, Adjusted R-squared:  0.6656 
#> F-statistic:  3378 on 1 and 1696 DF,  p-value: < 2.2e-16
# Add log10-transformed GDP per capita and population as explicit columns so
# the extended model can refer to them by name.
gm_model <- mutate(gm_filtered, lgdp = log10(gdpPercap), lpop = log10(pop))

# Model 3: extend the log-GDP model with log population, continent (entered
# as a factor) and calendar year.
model3 <- lm(lifeExp ~ lgdp + lpop + continent + year, data = gm_model)
summary(model3)
#> 
#> Call:
#> lm(formula = lifeExp ~ lgdp + lpop + continent + year, data = gm_model)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -24.8602  -3.2748   0.3171   3.6625  15.2558 
#> 
#> Coefficients:
#>                     Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)       -4.524e+02  1.697e+01 -26.660   <2e-16 ***
#> lgdp               1.210e+01  3.827e-01  31.609   <2e-16 ***
#> lpop               2.617e-01  2.222e-01   1.178    0.239    
#> continentAmericas  8.531e+00  4.763e-01  17.909   <2e-16 ***
#> continentAsia      6.902e+00  4.213e-01  16.380   <2e-16 ***
#> continentEurope    1.194e+01  5.315e-01  22.463   <2e-16 ***
#> continentOceania   1.211e+01  1.278e+00   9.478   <2e-16 ***
#> year               2.331e-01  8.936e-03  26.090   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 5.78 on 1690 degrees of freedom
#> Multiple R-squared:  0.8011, Adjusted R-squared:  0.8003 
#> F-statistic: 972.5 on 7 and 1690 DF,  p-value: < 2.2e-16

Plotted regression line with geom_smooth(method = "lm").

# Scatter of life expectancy against log10 GDP per capita for the filtered
# data, with a fitted straight line drawn over the points.
ggplot(data = gm_filtered, mapping = aes(log10(gdpPercap), lifeExp)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", colour = "blue") +
  labs(
    x = "log10(GDP per Capita)",
    y = "Life Expectancy (years)",
    title = "Life Expectancy vs log10(GDP per Capita)"
  ) +
  theme_minimal()

# Same relationship on the raw (untransformed) GDP-per-capita scale, for
# comparison with the log10 version of the plot.
# Reuses gm_filtered (gdpPercap < 50000) rather than repeating the filter, so
# the threshold is defined in one place and the plot shows exactly the data
# the models were fitted to.
ggplot(gm_filtered, aes(x = gdpPercap, y = lifeExp)) +
  # Semi-transparent points keep dense regions readable.
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Life Expectancy vs. GDP per Capita",
       x = "GDP per Capita", y = "Life Expectancy (years)") +
  theme_minimal()