library(gapminder)
library(dplyr)
library(ggplot2)
# Load the dataset shipped with the gapminder package into the workspace.
data(gapminder) # makes the 'gapminder' tibble available
# Print a compact overview: one line per column with its type and first values.
glimpse(gapminder) # quick peek at columns
#> Rows: 1,704
#> Columns: 6
#> $ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
#> $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
#> $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
#> $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
#> $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
#> $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
Used the gapminder dataset (country, continent, year, lifeExp, pop, gdpPercap).
Selected the relevant variables and filtered to South Africa and Ireland for comparison. Then a t-test was conducted to compare mean life expectancy (lifeExp) between those two countries using t.test(lifeExp ~ country, data = df1).
library(dplyr)
# Subset for South Africa & Ireland
# (the assignment target had been split from its `<-`; rejoined here)
df1 <- gapminder %>%
  filter(country %in% c("South Africa", "Ireland"))
# Run t-test (Welch by default): compares mean lifeExp between the two
# countries without assuming equal variances.
t_test_result <- t.test(lifeExp ~ country, data = df1)
# Auto-print the htest object; the recorded output follows below.
t_test_result
#>
#> Welch Two Sample t-test
#>
#> data: lifeExp by country
#> t = 10.067, df = 19.109, p-value = 4.466e-09
#> alternative hypothesis: true difference in means between group Ireland and group South Africa is not equal to 0
#> 95 percent confidence interval:
#> 15.07022 22.97794
#> sample estimates:
#> mean in group Ireland mean in group South Africa
#> 73.01725 53.99317
Visualization with ggplot2: Boxplot/scatterplot to show differences
# Life expectancy by country for the two-country subset `df1`: one box per
# country with the individual observations jittered on top. The fill legend
# is hidden because the x axis already names the countries.
ggplot(
  data = df1,
  mapping = aes(x = country, y = lifeExp, fill = country)
) +
  geom_boxplot(alpha = 0.6) +
  geom_jitter(alpha = 0.5, width = 0.2) +
  labs(
    x = "",
    y = "Life Expectancy (years)",
    title = "Life Expectancy: South Africa vs Ireland"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
Linear regression: Modeled life expectancy (lifeExp) as a function of GDP per capita (gdpPercap), first simple, then extended.
# Keep only rows with GDP per capita below 50,000 before fitting.
# (the assignment targets had been split from their `<-`; rejoined here)
gm_filtered <- gapminder %>%
  filter(gdpPercap < 50000) # drop extreme outliers
# Model 1: simple linear regression of life expectancy on raw GDP per capita.
model1 <- lm(lifeExp ~ gdpPercap, data = gm_filtered)
summary(model1)
#>
#> Call:
#> lm(formula = lifeExp ~ gdpPercap, data = gm_filtered)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -29.8604 -7.3390 0.8608 7.7148 19.5502
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 5.209e+01 2.969e-01 175.44 <2e-16 ***
#> gdpPercap 1.067e-03 2.747e-05 38.85 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 9.412 on 1696 degrees of freedom
#> Multiple R-squared: 0.4708, Adjusted R-squared: 0.4705
#> F-statistic: 1509 on 1 and 1696 DF, p-value: < 2.2e-16
# Model 2: same response, but GDP per capita enters on a log10 scale.
# (the assignment target had been split from its `<-`; rejoined here)
model2 <- lm(lifeExp ~ log10(gdpPercap), data = gm_filtered)
summary(model2)
#>
#> Call:
#> lm(formula = lifeExp ~ log10(gdpPercap), data = gm_filtered)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -29.6130 -4.0369 0.9468 4.5145 19.6084
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -10.5665 1.2184 -8.672 <2e-16 ***
#> log10(gdpPercap) 19.7920 0.3405 58.125 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 7.48 on 1696 degrees of freedom
#> Multiple R-squared: 0.6658, Adjusted R-squared: 0.6656
#> F-statistic: 3378 on 1 and 1696 DF, p-value: < 2.2e-16
# Add log10-transformed GDP per capita and population as explicit columns so
# the model formula below stays readable.
# (the assignment targets had been split from their `<-`; rejoined here)
gm_model <- gm_filtered %>%
  mutate(
    lgdp = log10(gdpPercap),
    lpop = log10(pop)
  )
# Model 3: extended regression with log GDP, log population, continent
# (factor, expanded to dummy variables by lm) and year as predictors.
model3 <- lm(lifeExp ~ lgdp + lpop + continent + year, data = gm_model)
summary(model3)
#>
#> Call:
#> lm(formula = lifeExp ~ lgdp + lpop + continent + year, data = gm_model)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -24.8602 -3.2748 0.3171 3.6625 15.2558
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -4.524e+02 1.697e+01 -26.660 <2e-16 ***
#> lgdp 1.210e+01 3.827e-01 31.609 <2e-16 ***
#> lpop 2.617e-01 2.222e-01 1.178 0.239
#> continentAmericas 8.531e+00 4.763e-01 17.909 <2e-16 ***
#> continentAsia 6.902e+00 4.213e-01 16.380 <2e-16 ***
#> continentEurope 1.194e+01 5.315e-01 22.463 <2e-16 ***
#> continentOceania 1.211e+01 1.278e+00 9.478 <2e-16 ***
#> year 2.331e-01 8.936e-03 26.090 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 5.78 on 1690 degrees of freedom
#> Multiple R-squared: 0.8011, Adjusted R-squared: 0.8003
#> F-statistic: 972.5 on 7 and 1690 DF, p-value: < 2.2e-16
Plotted regression line with geom_smooth(method = "lm").
# Scatter of life expectancy against log10(GDP per capita) from `gm_filtered`,
# overlaid with a straight-line fit drawn by geom_smooth(method = "lm").
ggplot(
  data = gm_filtered,
  mapping = aes(x = log10(gdpPercap), y = lifeExp)
) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "blue") +
  labs(
    x = "log10(GDP per Capita)",
    y = "Life Expectancy (years)",
    title = "Life Expectancy vs log10(GDP per Capita)"
  ) +
  theme_minimal()
# Scatter of life expectancy against raw GDP per capita with a linear fit.
# Rows with gdpPercap >= 50000 are filtered out before plotting.
# (the pipeline's data source had been split from its `%>%`; rejoined here)
gapminder %>%
  filter(gdpPercap < 50000) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Life Expectancy vs. GDP per Capita")