Life expectancy is a key indicator of population health, often linked to economic and social development. This study examines how changes in GDP per capita, healthcare expenditure, tertiary education enrolment, and urbanisation have been associated with life expectancy in the United Kingdom from 1971 to 2019. Using annual data from Our World in Data, we employed OLS regression models to explore long-run trends. After differencing to meet model assumptions, no significant short-term associations were found, suggesting that improvements in life expectancy reflect gradual, cumulative processes rather than immediate fluctuations in socioeconomic factors.
library(readxl)
life_expectancy <- read_excel("~/Desktop/life-expectancy-true.xlsx",
range = "A2:D51")
share_of_population_urban <- read_excel("~/Desktop/share-of-population-urban-true.xlsx",
range = "A2:D51")
gdp_per_capita <- read_excel("~/Desktop/gdp-per-capita-maddison-project-database-true.xlsx",
range = "A2:D51")
health_expenditure_and_financing_per_capita <- read_excel("~/Desktop/health-expenditure-and-financing-per-capita-true.xlsx",
range = "A2:D51")
primary_secondary_enrollment_completion_rates <- read_excel("~/Desktop/primary-secondary-enrollment-completion-rates-true.xlsx",
range = "A2:F51")
#joining the data
library(dplyr)
unfilteredqmdataset <- life_expectancy |>
left_join(gdp_per_capita, by= "Year") |>
left_join(health_expenditure_and_financing_per_capita, by= "Year") |>
left_join(primary_secondary_enrollment_completion_rates, by= "Year") |>
left_join(share_of_population_urban, by="Year")
new_healthdf <- unfilteredqmdataset %>%
dplyr::select(Year, `GDP per capita`, `Gross enrolment ratio in tertiary education`,
`Health expenditure per capita - Total`, `Urban population (% of total population)`,
`Period life expectancy at birth`,)
new_healthdf <- as.data.frame(new_healthdf)
2)We checked that the data frame looks accurate, ensuring it is assigned as a data frame. Then we renamed each column to an abbreviated version for ease of use.
new_healthdf <- as.data.frame(new_healthdf)
head(new_healthdf)
## Year GDP per capita Gross enrolment ratio in tertiary education
## 1 1971 17440 15.05809
## 2 1972 18002 15.68609
## 3 1973 19168 15.99295
## 4 1974 18903 16.43138
## 5 1975 18884 18.02605
## 6 1976 19311 18.51698
## Health expenditure per capita - Total
## 1 753.588
## 2 800.258
## 3 833.654
## 4 917.653
## 5 970.169
## 6 997.355
## Urban population (% of total population) Period life expectancy at birth
## 1 77.030 72.2340
## 2 77.195 71.9882
## 3 77.358 72.1846
## 4 77.521 72.3760
## 5 77.683 72.6500
## 6 77.844 72.6341
library(ggplot2)
class(new_healthdf) # new_healthdf is a dataframe
names(new_healthdf)
# shortening the column names:
colnames(new_healthdf) <- c("year", "gdp.pc","edu.enrol","health.exp",
"urban.pop","life.exp")
head(new_healthdf)
## year gdp.pc edu.enrol health.exp urban.pop life.exp
## 1 1971 17440 15.05809 753.588 77.030 72.2340
## 2 1972 18002 15.68609 800.258 77.195 71.9882
## 3 1973 19168 15.99295 833.654 77.358 72.1846
## 4 1974 18903 16.43138 917.653 77.521 72.3760
## 5 1975 18884 18.02605 970.169 77.683 72.6500
## 6 1976 19311 18.51698 997.355 77.844 72.6341
# creating a descriptive table
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer (new_healthdf[, c(2:6)], type ="text",
title="Table 1: Desciptive statistics", digits = 1)
##
## Table 1: Desciptive statistics
## =================================================
## Statistic N Mean St. Dev. Min Max
## -------------------------------------------------
## gdp.pc 49 28,416.4 6,933.9 17,440.0 39,113.0
## edu.enrol 49 40.6 18.8 15.1 65.5
## health.exp 49 2,498.4 1,430.0 753.6 4,959.7
## urban.pop 49 79.4 1.8 77.0 83.7
## life.exp 49 76.8 3.0 72.0 81.4
## -------------------------------------------------
The next stage of the process involved creating bivariate models for each independent variable against the dependent variable, to check for isolated correlations and that model assumptions would be met for ordinary least squares regression.
# ===========================================================
# --- OLS MODELS ---
# ===========================================================
# --- URBAN POPULATION VARIABLE ---
# ===========================================================
# Starting with a simple bivariate regression (life.exp ~ urban.pop) to observe the raw correlation
library(HistData)
library(tidyverse)
# create an initial OLS model and save the results of model into the object mod1
mod1<-lm(life.exp~urban.pop, data=new_healthdf)
# a summary of model result
summary(mod1)
##
## Call:
## lm(formula = life.exp ~ urban.pop, data = new_healthdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1053 -1.5481 0.1166 1.2942 2.2189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -41.1004 8.8495 -4.644 2.77e-05 ***
## urban.pop 1.4857 0.1115 13.329 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.404 on 47 degrees of freedom
## Multiple R-squared: 0.7908, Adjusted R-squared: 0.7863
## F-statistic: 177.7 on 1 and 47 DF, p-value: < 2.2e-16
# creating a model summary table
stargazer(mod1, type = "text",
title ="Table 2: An OLS model of %Urban Population on Life Expectancy",
dep.var.labels = "Life Expectancy",
covariate.labels = "%Urban Population")
##
## Table 2: An OLS model of %Urban Population on Life Expectancy
## ===============================================
## Dependent variable:
## ---------------------------
## Life Expectancy
## -----------------------------------------------
## %Urban Population 1.486***
## (0.111)
##
## Constant -41.100***
## (8.850)
##
## -----------------------------------------------
## Observations 49
## R2 0.791
## Adjusted R2 0.786
## Residual Std. Error 1.404 (df = 47)
## F Statistic 177.651*** (df = 1; 47)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Initial results showed a strong, significant correlation between the independent and dependent variable, and high R-sqaured and adjusted R-squared values; for every 1% increase in urban population share, life expectancy increased by about 1.5 years, at a 99% significance level.
However, for this correlation to be valid we had to test it met the range of model assumptions for OLS, including linearity, homoskedasticity, independence/ no autocorrelation, and normality.
# --- OLS MODEL DIAGNOSTICS ---
# VISUAL INSPECTION: Residuals vs Fitted plot, Normal Q-Q plot, Scale-Location, Residuals vs leverage
# put four plots together using par(mfrow( )) and produce four standard plots
par(mfrow=c(1,2))
plot(mod1)
# Trying solutions to fix the non-linearity problem:
# -- i) data points might be autocorrelated, adding Year as a control variable: --
mod3 <- lm(life.exp ~ urban.pop + year, data = new_healthdf)
plot(mod3, which=1)
# Adding Year helped remove the strong linear trend, but it looks like the effect of time is still non-linear
library(performance)
library(gvlma)
check_autocorrelation(mod3) # Warning: Autocorrelated residuals detected (p < .001)
## Warning: Autocorrelated residuals detected (p < .001).
# ===========================================================
# Looking at the impact of "GDP per Capita"'s on "Period life expectancy at birth"
# Creating an OLS model
mod_gdp <-lm(life.exp~gdp.pc, data=new_healthdf)
# creating a model summary table
stargazer(mod_gdp, type = "text",
title ="An OLS model of 'GDP per Capita' on 'Life Expectancy'",
dep.var.labels = "Life Expectancy",
covariate.labels = "GDP per Capita")
##
## An OLS model of 'GDP per Capita' on 'Life Expectancy'
## ===============================================
## Dependent variable:
## ---------------------------
## Life Expectancy
## -----------------------------------------------
## GDP per Capita 0.0004***
## (0.00001)
##
## Constant 64.576***
## (0.339)
##
## -----------------------------------------------
## Observations 49
## R2 0.967
## Adjusted R2 0.966
## Residual Std. Error 0.557 (df = 47)
## F Statistic 1,383.460*** (df = 1; 47)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Again, we see a significant correlation between GDP per capita and life expectancy, however it is a weak one (0.0004). We also have very high R-squared values.
We then tested the model assumptions. The assumption failures here are a bit less obvious, so we ran further specific tests for normality, autocorrelation and homoskedasticity.
# --- OLS MODEL DIAGNOSTICS ---
# VISUAL INSPECTION: Residuals vs Fitted plot, Normal Q-Q plot, Scale-Location, Residuals vs leverage
# Producing four standard plots
plot(mod_gdp)
# -- Individual Assumption Checks --
check_normality(mod_gdp) #OK: residuals appear as normally distributed (p = 0.469).
## OK: residuals appear as normally distributed (p = 0.469).
check_autocorrelation(mod_gdp) #Warning: Autocorrelated residuals detected (p < .001).
## Warning: Autocorrelated residuals detected (p < .001).
check_heteroscedasticity(mod_gdp) #Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.012).
## Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.012).
# --- OVERALL ASSUMPTIONS EVALUATION ---
gvlma(mod_gdp) #Heteroscedasticity assumption not satisfied!
##
## Call:
## lm(formula = life.exp ~ gdp.pc, data = new_healthdf)
##
## Coefficients:
## (Intercept) gdp.pc
## 6.458e+01 4.309e-04
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = mod_gdp)
##
## Value p-value Decision
## Global Stat 7.306174 0.12057 Assumptions acceptable.
## Skewness 0.443826 0.50528 Assumptions acceptable.
## Kurtosis 1.076207 0.29955 Assumptions acceptable.
## Link Function 0.002536 0.95984 Assumptions acceptable.
## Heteroscedasticity 5.783604 0.01618 Assumptions NOT satisfied!
We can see that heteroskedasticity assumptions are not satisfied, so again the simple bivariate model is not appropriate for our data.
Next, we looked at life expectancy and health expenditure per capita, creating the OLS model. As with GDP per capita, we observe a weak yet significant correlation; every $1 increase in GDP per capita leads to 0.002 years increase in life expectancy- so roughly a day.
# --- FOLLOWING THE STEPS FOR HEALTH EXPENDITURE VARIABLE ---
# ===========================================================
# Looking at the impact of "Health expenditure per capita" on "Period life expectancy at birth"
# Starting with a simple bivariate regression (life.exp ~ health.exp) to observe the raw correlation
# Creating an OLS model
mod_healthexp <-lm(life.exp~health.exp, data=new_healthdf)
# creating a model summary table
stargazer(mod_healthexp, type = "text",
title ="An OLS model of 'Health expenditure per capita' on 'Life Expectancy'",
dep.var.labels = "Life Expectancy",
covariate.labels = "Health expenditure per capita")
##
## An OLS model of 'Health expenditure per capita' on 'Life Expectancy'
## =========================================================
## Dependent variable:
## ---------------------------
## Life Expectancy
## ---------------------------------------------------------
## Health expenditure per capita 0.002***
## (0.0001)
##
## Constant 71.646***
## (0.198)
##
## ---------------------------------------------------------
## Observations 49
## R2 0.950
## Adjusted R2 0.949
## Residual Std. Error 0.683 (df = 47)
## F Statistic 902.410*** (df = 1; 47)
## =========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# --- OLS MODEL DIAGNOSTICS ---
# VISUAL INSPECTION: Residuals vs Fitted plot, Normal Q-Q plot, Scale-Location, Residuals vs leverage
# Producing four standard plots
plot(mod_healthexp)
# -- Individual Assumption Checks --
check_normality(mod_healthexp) #OK: residuals appear as normally distributed
## OK: residuals appear as normally distributed (p = 0.508).
check_autocorrelation(mod_healthexp) #Warning: Autocorrelated residuals detected
## Warning: Autocorrelated residuals detected (p < .001).
check_heteroscedasticity(mod_healthexp) #Warning: Heteroscedasticity (non-constant error variance) detected
## Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.001).
# --- OVERALL ASSUMPTIONS EVALUATION ---
gvlma(mod_healthexp) #first model, we can see that the some assumptions are not satisfied
##
## Call:
## lm(formula = life.exp ~ health.exp, data = new_healthdf)
##
## Coefficients:
## (Intercept) health.exp
## 71.645522 0.002071
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = mod_healthexp)
##
## Value p-value Decision
## Global Stat 28.47380 9.997e-06 Assumptions NOT satisfied!
## Skewness 0.03282 8.562e-01 Assumptions acceptable.
## Kurtosis 0.77532 3.786e-01 Assumptions acceptable.
## Link Function 18.32189 1.866e-05 Assumptions NOT satisfied!
## Heteroscedasticity 9.34376 2.237e-03 Assumptions NOT satisfied!
# --- FOLLOWING THE STEPS FOR ENROLMENT RATIO IN TERTIARY EDU VARIABLE ---
# ========================================================================
# Looking at the impact of "Gross enrolment ratio in tertiary edu"'s on "Period life expectancy at birth"
# Starting with a simple bivariate regression (life.exp ~ urban.pop) to observe the raw correlation
# Creating an OLS model
mod_edu <-lm(life.exp~edu.enrol, data=new_healthdf)
# creating a model summary table
stargazer(mod_edu, type = "text",
title ="An OLS model of 'Gross enrolment ratio in tertiary education' on 'Life Expectancy'",
dep.var.labels = "Life Expectancy",
covariate.labels = "Gross enrol. ratio in tertiary edu")
##
## An OLS model of 'Gross enrolment ratio in tertiary education' on 'Life Expectancy'
## ==============================================================
## Dependent variable:
## ---------------------------
## Life Expectancy
## --------------------------------------------------------------
## Gross enrol. ratio in tertiary edu 0.151***
## (0.009)
##
## Constant 70.698***
## (0.379)
##
## --------------------------------------------------------------
## Observations 49
## R2 0.870
## Adjusted R2 0.867
## Residual Std. Error 1.106 (df = 47)
## F Statistic 314.929*** (df = 1; 47)
## ==============================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Here we again see a significant correlation at the 99% signifigance level, showing that a 1% increase in the post-secondary enrolment ratio leads to 0.15 years increase in life expectancy. The R-squared values are also very high.
However, again, several OLS assumptions are violated, as shown in the residuals plots. The curved graph in the Residuals vs Fitted plot means linearity assumption is not met.
# VISUAL INSPECTION: Residuals vs Fitted plot, Normal Q-Q plot, Scale-Location, Residuals vs leverage
# Producing four standard plots
plot(mod_edu)
Differencing allows us to stabilise variance over time, computing the difference between consecutive observations, and this can reduce the issues with autocorrelation. After differencing the data, we found that each of the indpendent variable met the OLS assumptions.
Source: Rob J Hyndman and George Athanasopoulos (2025). 8.1 Stationarity and differencing | Forecasting: Principles and Practice (2nd ed). [online] Otexts.com. Available at: https://otexts.com/fpp2/stationarity.html#differencing [Accessed 3 Nov. 2025].
# -- ii) trying differencing to focus on year-to-year changes --
# This answers: Do year-to-year changes in urban population explain year-to-year changes in life expectancy?
new_healthdf$diff_LE <- c(NA, diff(new_healthdf$life.exp))
new_healthdf$diff_Urban <- c(NA, diff(new_healthdf$urban.pop))
healthdf_diff <- na.omit(new_healthdf[, c("diff_LE", "diff_Urban")])
model_diff <- lm(diff_LE ~ diff_Urban, data = healthdf_diff)
plot(model_diff, which = 1)
#After removing long-term time trends through first differencing, the model residuals show no clear pattern
#indicating that the assumptions of linearity and homoscedasticity are satisfied.
check_autocorrelation(model_diff) #OK: Residuals appear to be independent and not autocorrelated (p = 0.094).
## OK: Residuals appear to be independent and not autocorrelated (p = 0.068).
# using the differencing model for the following diagnostics checks
# --- 2. Normal Q-Q Plot ---
plot(model_diff, which = 2) #normality assumption is met
# --- 3. Scale-Location Plot ---
plot(model_diff, which=3)
# --- 4. Residuals vs Leverage Plot ---
plot(model_diff,which=5)
# NEW OLS MODEL: model_diff <- lm(diff_LE ~ diff_Urban, data = healthdf_diff)
# creating a model summary table
stargazer(model_diff, type = "text",
title ="Table 2: An OLS model of %Urban Population on Life Expectancy",
dep.var.labels = "Life Expectancy",
covariate.labels = "%Urban Population")
##
## Table 2: An OLS model of %Urban Population on Life Expectancy
## ===============================================
## Dependent variable:
## ---------------------------
## Life Expectancy
## -----------------------------------------------
## %Urban Population -0.072
## (0.229)
##
## Constant 0.202***
## (0.042)
##
## -----------------------------------------------
## Observations 48
## R2 0.002
## Adjusted R2 -0.020
## Residual Std. Error 0.197 (df = 46)
## F Statistic 0.100 (df = 1; 46)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# --- OTHER APPROACHES TO CHECK ASSUMPTIONS ---
# -- Linearity --
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# Component plus residual plot
crPlots(model_diff)
# -- Normality --
# X-axis grid
x2 <- seq(min(model_diff$residuals), max(model_diff$residuals), length = 50)
# Normal curve
fun <- dnorm(x2, mean = mean(model_diff$residuals), sd = sd(model_diff$residuals))
# Histogram
hist(model_diff$residuals, prob = TRUE, col = "white",
ylim = c(0, max(fun)),
main = "Histogram of model residual with normal curve")
lines(x2, fun, col = 2, lwd = 2)
# run shapiro-Wilk test to test for normality of residual
shapiro.test(model_diff$residuals)
##
## Shapiro-Wilk normality test
##
## data: model_diff$residuals
## W = 0.97844, p-value = 0.5156
# -- Homoskedasticity --
ncvTest(model_diff)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.08713268, Df = 1, p = 0.76785
spreadLevelPlot(model_diff)
##
## Suggested power transformation: -0.1909099
# -- Autocorrelation --
durbinWatsonTest(model_diff)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.3319191 2.546581 0.076
## Alternative hypothesis: rho != 0
plot(model_diff$residuals)
acf(model_diff$residuals)
library(performance)
library(see)
library(patchwork)
check_autocorrelation(model_diff) #OK: Residuals appear to be independent and not autocorrelated (p = 0.096).
## OK: Residuals appear to be independent and not autocorrelated (p = 0.068).
# --- OVERALL ASSUMPTIONS EVALUATION ---
library(gvlma)
gvlma(model_diff) #model with differencing applied, assumptions acceptable
##
## Call:
## lm(formula = diff_LE ~ diff_Urban, data = healthdf_diff)
##
## Coefficients:
## (Intercept) diff_Urban
## 0.2018 -0.0724
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model_diff)
##
## Value p-value Decision
## Global Stat 0.73337 0.9472 Assumptions acceptable.
## Skewness 0.39134 0.5316 Assumptions acceptable.
## Kurtosis 0.06717 0.7955 Assumptions acceptable.
## Link Function 0.04325 0.8353 Assumptions acceptable.
## Heteroscedasticity 0.23161 0.6303 Assumptions acceptable.
# --- OVERALL VISUAL CHECK FOR ASSUMPTIONS ---
library(performance)
library(see)
library(patchwork)
check_model(model_diff)
4) Next, we conducted the differencing method on life expectancy and GDP
per capita. However, it still produced a curved pattern in the residuals
vs fitted plot.
# Trying solutions:
# Trying differencing to focus on year-to-year changes
new_healthdf$diff_LE <- c(NA, diff(new_healthdf$life.exp))
new_healthdf$diff_Gdp <- c(NA, diff(new_healthdf$gdp.pc))
gdp_diff_df <- na.omit(new_healthdf[, c("diff_LE", "diff_Gdp")])
# Run the OLS model for differenced data
model_diff_gdp <- lm(diff_LE ~ diff_Gdp, data = gdp_diff_df)
plot(model_diff_gdp, which = 1) #Still produces a curved pattern (?)
# --- OVERALL ASSUMPTIONS EVALUATION ---
gvlma(model_diff_gdp)
##
## Call:
## lm(formula = diff_LE ~ diff_Gdp, data = gdp_diff_df)
##
## Coefficients:
## (Intercept) diff_Gdp
## 2.275e-01 -7.892e-05
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model_diff_gdp)
##
## Value p-value Decision
## Global Stat 2.266912 0.6868 Assumptions acceptable.
## Skewness 0.262135 0.6087 Assumptions acceptable.
## Kurtosis 0.004502 0.9465 Assumptions acceptable.
## Link Function 1.752104 0.1856 Assumptions acceptable.
## Heteroscedasticity 0.248171 0.6184 Assumptions acceptable.
# Differencing did fix the assumption violations, but there is still autocorrelation
# -- Individual Assumption Checks --
check_normality(model_diff_gdp) #OK: residuals appear as normally distributed (p = 0.661).
## OK: residuals appear as normally distributed (p = 0.661).
check_autocorrelation(model_diff_gdp) #Warning: Autocorrelated residuals detected (p < .040).
## Warning: Autocorrelated residuals detected (p = 0.036).
check_heteroscedasticity(model_diff_gdp) #OK: Error variance appears to be homoscedastic (p = 0.296).
## OK: Error variance appears to be homoscedastic (p = 0.296).
# --- OVERALL VISUAL CHECK FOR ASSUMPTIONS ---
check_model(model_diff_gdp)
6) We decided GDP per capita could still be included in the multivariate
regression in the chance that controlling for other variables would
remove the autocorrelation issue.
# Trying differencing to focus on year-to-year changes
new_healthdf$diff_LE <- c(NA, diff(new_healthdf$life.exp))
new_healthdf$diff_HealthExp <- c(NA, diff(new_healthdf$health.exp))
healthexp_diff_df <- na.omit(new_healthdf[, c("diff_LE", "diff_HealthExp")])
# Run the OLS model for differenced data
model_diff_healthexp <- lm(diff_LE ~ diff_HealthExp, data = healthexp_diff_df)
# --- OVERALL ASSUMPTIONS EVALUATION ---
gvlma(model_diff_healthexp) #assumptions acceptable
##
## Call:
## lm(formula = diff_LE ~ diff_HealthExp, data = healthexp_diff_df)
##
## Coefficients:
## (Intercept) diff_HealthExp
## 0.1645198 0.0003118
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model_diff_healthexp)
##
## Value p-value Decision
## Global Stat 0.74364 0.9458 Assumptions acceptable.
## Skewness 0.25099 0.6164 Assumptions acceptable.
## Kurtosis 0.17233 0.6780 Assumptions acceptable.
## Link Function 0.07927 0.7783 Assumptions acceptable.
## Heteroscedasticity 0.24105 0.6234 Assumptions acceptable.
# -- Individual Assumption Checks --
check_normality(model_diff_healthexp) #OK: residuals appear as normally distributed
## OK: residuals appear as normally distributed (p = 0.694).
check_autocorrelation(model_diff_healthexp) #OK: Residuals appear to be independent and not autocorrelated (p = 0.052).
## Warning: Autocorrelated residuals detected (p = 0.038).
check_heteroscedasticity(model_diff_healthexp) #OK: Error variance appears to be homoscedastic
## OK: Error variance appears to be homoscedastic (p = 0.483).
# --- OVERALL VISUAL CHECK FOR ASSUMPTIONS ---
check_model(model_diff_healthexp)
9) Finally, we looked at the differenced relationship for life
expectancy and the tertiary enrolment ratio.
# ii) Trying differencing to focus on year-to-year changes --
new_healthdf$diff_LE <- c(NA, diff(new_healthdf$life.exp))
new_healthdf$diff_Edu <- c(NA, diff(new_healthdf$edu.enrol))
edu_diff_df <- na.omit(new_healthdf[, c("diff_LE", "diff_Edu")])
# Run the OLS model for differenced data
model_diff_edu <- lm(diff_LE ~ diff_Edu, data = edu_diff_df)
gvlma(model_diff_edu) #model with differencing applied, assumptions acceptable
##
## Call:
## lm(formula = diff_LE ~ diff_Edu, data = edu_diff_df)
##
## Coefficients:
## (Intercept) diff_Edu
## 0.201108 -0.008817
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model_diff_edu)
##
## Value p-value Decision
## Global Stat 0.645980 0.9578 Assumptions acceptable.
## Skewness 0.365841 0.5453 Assumptions acceptable.
## Kurtosis 0.076570 0.7820 Assumptions acceptable.
## Link Function 0.001531 0.9688 Assumptions acceptable.
## Heteroscedasticity 0.202038 0.6531 Assumptions acceptable.
# -- Individual Assumption Checks --
check_normality(model_diff_edu) #OK: residuals appear as normally distributed (p = 0.849).
## OK: residuals appear as normally distributed (p = 0.849).
check_autocorrelation(model_diff_edu) #OK: Residuals appear to be independent and not autocorrelated (p = 0.066).
## OK: Residuals appear to be independent and not autocorrelated (p = 0.076).
check_heteroscedasticity(model_diff_edu) #OK: Error variance appears to be homoscedastic (p = 0.739).
## OK: Error variance appears to be homoscedastic (p = 0.739).
# --- OVERALL VISUAL CHECK FOR ASSUMPTIONS ---
check_model(model_diff_edu)
# This model that uses differencing shows better visual diagnostics
summary(model_diff)
##
## Call:
## lm(formula = diff_LE ~ diff_Urban, data = healthdf_diff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43569 -0.08891 0.00555 0.13242 0.38876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.20183 0.04249 4.750 2.03e-05 ***
## diff_Urban -0.07240 0.22932 -0.316 0.754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1966 on 46 degrees of freedom
## Multiple R-squared: 0.002162, Adjusted R-squared: -0.01953
## F-statistic: 0.09967 on 1 and 46 DF, p-value: 0.7537
summary(model_diff_gdp)
##
## Call:
## lm(formula = diff_LE ~ diff_Gdp, data = gdp_diff_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42893 -0.08049 0.02094 0.10651 0.40537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.275e-01 3.666e-02 6.205 1.43e-07 ***
## diff_Gdp -7.892e-05 5.308e-05 -1.487 0.144
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1922 on 46 degrees of freedom
## Multiple R-squared: 0.04584, Adjusted R-squared: 0.0251
## F-statistic: 2.21 on 1 and 46 DF, p-value: 0.1439
summary(model_diff_healthexp)
##
## Call:
## lm(formula = diff_LE ~ diff_HealthExp, data = healthexp_diff_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42949 -0.10861 -0.00503 0.11651 0.37241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1645198 0.0400658 4.106 0.000163 ***
## diff_HealthExp 0.0003118 0.0003257 0.958 0.343299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1948 on 46 degrees of freedom
## Multiple R-squared: 0.01954, Adjusted R-squared: -0.001771
## F-statistic: 0.9169 on 1 and 46 DF, p-value: 0.3433
summary(model_diff_edu)
##
## Call:
## lm(formula = diff_LE ~ diff_Edu, data = edu_diff_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44137 -0.09135 -0.00392 0.13347 0.40012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.201108 0.032952 6.103 2.03e-07 ***
## diff_Edu -0.008817 0.016055 -0.549 0.586
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1961 on 46 degrees of freedom
## Multiple R-squared: 0.006513, Adjusted R-squared: -0.01508
## F-statistic: 0.3016 on 1 and 46 DF, p-value: 0.5856
df_multi_diff <- (new_healthdf[, c("diff_LE", "diff_Gdp", "diff_Edu", "diff_HealthExp", "diff_Urban")])
# --- CHECKING FOR MULTICOLLINEARITY
# correlation matrix
cor(df_multi_diff[,1:5])
## diff_LE diff_Gdp diff_Edu diff_HealthExp diff_Urban
## diff_LE 1 NA NA NA NA
## diff_Gdp NA 1 NA NA NA
## diff_Edu NA NA 1 NA NA
## diff_HealthExp NA NA NA 1 NA
## diff_Urban NA NA NA NA 1
model_multi_diff <- lm(diff_LE ~ diff_Gdp + diff_Edu + diff_HealthExp + diff_Urban, data = df_multi_diff)
# Variance Inflation Factor
vif(model_multi_diff) # The VIFs are all around 1 - no multicollinearity
## diff_Gdp diff_Edu diff_HealthExp diff_Urban
## 1.066966 1.157544 1.131227 1.238345
# Variance Inflation Factor
vif(model_multi_diff) # The VIFs are all around 1 - no multicollinearity
## diff_Gdp diff_Edu diff_HealthExp diff_Urban
## 1.066966 1.157544 1.131227 1.238345
# possible models with variables..
Multi_model1<-lm(diff_LE ~ diff_Gdp + diff_Edu , data=df_multi_diff)
Multi_model2<-lm(diff_LE ~ diff_Gdp + diff_Edu + diff_HealthExp, data=df_multi_diff)
Multi_model3<-lm(diff_LE ~ diff_Gdp + diff_Edu + diff_HealthExp + diff_Urban, data=df_multi_diff)
# possible models with variables excluding GDP per capita..
Multi_model4<-lm(diff_LE ~ diff_Edu + diff_HealthExp, data=df_multi_diff)
Multi_model5<-lm(diff_LE ~ diff_Edu + diff_HealthExp + diff_Urban, data=df_multi_diff)
# models summary
library(texreg)
## Version: 1.39.4
## Date: 2024-07-23
## Author: Philip Leifeld (University of Manchester)
##
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
##
## Attaching package: 'texreg'
## The following object is masked from 'package:tidyr':
##
## extract
screenreg(list(Multi_model1, Multi_model2, Multi_model3, Multi_model4, Multi_model5))
##
## =====================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5
## ---------------------------------------------------------------------
## (Intercept) 0.24 *** 0.21 *** 0.25 *** 0.17 *** 0.20 ***
## (0.04) (0.05) (0.06) (0.04) (0.05)
## diff_Gdp -0.00 -0.00 -0.00
## (0.00) (0.00) (0.00)
## diff_Edu -0.01 -0.01 -0.02 -0.01 -0.02
## (0.02) (0.02) (0.02) (0.02) (0.02)
## diff_HealthExp 0.00 0.00 0.00 0.00
## (0.00) (0.00) (0.00) (0.00)
## diff_Urban -0.30 -0.20
## (0.25) (0.25)
## ---------------------------------------------------------------------
## R^2 0.05 0.08 0.11 0.03 0.04
## Adj. R^2 0.01 0.02 0.03 -0.01 -0.02
## Num. obs. 48 48 48 48 48
## =====================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
library(performance)
library(insight)
library(see)
# compute different indices of model fit
Modelresult <- compare_performance(Multi_model1, Multi_model2, Multi_model3, Multi_model4, Multi_model5)
print_md(Modelresult)
| Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | R2 | R2 (adj.) | RMSE | Sigma |
|---|---|---|---|---|---|---|---|---|
| Multi_model1 | lm | -16.4 (0.29) | -15.5 (0.35) | -8.9 (0.47) | 0.05 | 9.31e-03 | 0.19 | 0.19 |
| Multi_model2 | lm | -16.1 (0.24) | -14.6 (0.23) | -6.7 (0.15) | 0.08 | 0.02 | 0.18 | 0.19 |
| Multi_model3 | lm | -15.7 (0.20) | -13.6 (0.14) | -4.4 (0.05) | 0.11 | 0.03 | 0.18 | 0.19 |
| Multi_model4 | lm | -15.4 (0.17) | -14.4 (0.20) | -7.9 (0.28) | 0.03 | -0.01 | 0.19 | 0.20 |
| Multi_model5 | lm | -14.1 (0.09) | -12.6 (0.08) | -4.7 (0.06) | 0.04 | -0.02 | 0.19 | 0.20 |
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
summary(Multi_model3)
##
## Call:
## lm(formula = diff_LE ~ diff_Gdp + diff_Edu + diff_HealthExp +
## diff_Urban, data = df_multi_diff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40460 -0.11137 0.01445 0.09820 0.46659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.520e-01 5.938e-02 4.244 0.000115 ***
## diff_Gdp -1.003e-04 5.467e-05 -1.834 0.073549 .
## diff_Edu -1.811e-02 1.688e-02 -1.073 0.289132
## diff_HealthExp 5.236e-04 3.407e-04 1.537 0.131615
## diff_Urban -3.024e-01 2.488e-01 -1.215 0.230809
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1916 on 43 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1135, Adjusted R-squared: 0.031
## F-statistic: 1.376 on 4 and 43 DF, p-value: 0.2582