1. Load Data

# Preview a few rows to understand the structure
head(Maddison_Economic_Data)
head(Fraiser_Data)

2. Clean and Combine Data

# Filter Maddison data to relevant time period and drop missing entries
Maddison_Economic_Data_Cleaned <- Maddison_Economic_Data %>% 
  filter(year >= 1970, year <= 2022) %>% 
  na.omit() %>%
  mutate(year = as.integer(year))

# Select relevant columns from Fraser data and rename for clarity
Fraiser_Data_Cleaned <- Fraiser_Data[, c(1,3,4,5,8,20,32,41,59,82)]
colnames(Fraiser_Data_Cleaned) <- c("year", "ISO_Code_3", "country", "EFW_Index",
  "Gov_Consumption", "Gov_Size", "Property_Rights",
  "Sound_Money", "Freedom_to_Trade", "Regulation")

# Remove rows with missing values and ensure matching types for join
Fraiser_Data_Cleaned <- Fraiser_Data_Cleaned %>% 
  na.omit() %>%
  mutate(year = as.integer(year))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `year = as.integer(year)`.
## Caused by warning:
## ! NAs introduced by coercion
# Combine datasets on shared keys: country and year
Combined_Economic_Data <- inner_join(
  Maddison_Economic_Data_Cleaned,
  Fraiser_Data_Cleaned,
  by = c("country", "year")
) %>% 
  mutate(log_gdppc = log(gdppc))  # log-transform GDP per capita

3. Exploratory Analysis

# Get detailed summary statistics without visual charts
skim_without_charts(Combined_Economic_Data)
Data summary
Name Combined_Economic_Data
Number of rows 3316
Number of columns 15
_______________________
Column type frequency:
character 11
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
countrycode 0 1 3 3 0 133 0
country 0 1 4 24 0 133 0
region 0 1 9 28 0 8 0
ISO_Code_3 0 1 3 3 0 133 0
EFW_Index 0 1 1 18 0 523 0
Gov_Consumption 0 1 1 21 0 2506 0
Gov_Size 0 1 4 18 0 3296 0
Property_Rights 0 1 15 18 0 3314 0
Sound_Money 0 1 1 21 0 3293 0
Freedom_to_Trade 0 1 1 19 0 3259 0
Regulation 0 1 3 18 0 3261 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
year 0 1 2007.32 12.03 1970.00 2003.00 2010.00 2016.00 2022.00
gdppc 0 1 17182.89 18516.74 561.00 3500.97 10996.65 24969.04 160051.24
pop 0 1 48022.57 160258.01 91.00 4046.46 10106.57 29390.11 1402536.74
log_gdppc 0 1 9.14 1.22 6.33 8.16 9.31 10.13 11.98
# Basic descriptive stats for selected variables
Combined_Economic_Data %>%
  select(year, country, region, gdppc, log_gdppc, EFW_Index, pop) %>%
  summary()
##       year        country             region              gdppc       
##  Min.   :1970   Length:3316        Length:3316        Min.   :   561  
##  1st Qu.:2003   Class :character   Class :character   1st Qu.:  3501  
##  Median :2010   Mode  :character   Mode  :character   Median : 10997  
##  Mean   :2007                                         Mean   : 17183  
##  3rd Qu.:2016                                         3rd Qu.: 24969  
##  Max.   :2022                                         Max.   :160051  
##    log_gdppc       EFW_Index              pop         
##  Min.   : 6.330   Length:3316        Min.   :     91  
##  1st Qu.: 8.161   Class :character   1st Qu.:   4046  
##  Median : 9.305   Mode  :character   Median :  10107  
##  Mean   : 9.140                      Mean   :  48023  
##  3rd Qu.:10.125                      3rd Qu.:  29390  
##  Max.   :11.983                      Max.   :1402537
# Convert Fraser indicators to numeric to ensure compatibility in modeling
Combined_Economic_Data <- Combined_Economic_Data %>%
  mutate(across(c(EFW_Index, Gov_Consumption, Gov_Size, Property_Rights, 
                  Sound_Money, Freedom_to_Trade, Regulation), as.numeric))

Correlation Heatmap

# Create heatmap to visualize pairwise correlations between economic indicators
cor_vars <- Combined_Economic_Data %>%
  select(log_gdppc, EFW_Index, Gov_Consumption, Gov_Size,
         Property_Rights, Sound_Money, Freedom_to_Trade, Regulation)
cor_matrix <- cor(cor_vars, use = "complete.obs")
cor_long <- melt(cor_matrix)

# Plot heatmap with correlation values
ggplot(cor_long, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name = "Correlation") +
  theme_minimal() +
  labs(title = "Correlation Heatmap of Economic Indicators") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        axis.title = element_blank())

Boxplot by Region

# Visualize distribution of GDP per capita (log) across geographic regions
ggplot(Combined_Economic_Data, aes(x = region, y = log_gdppc, fill = region)) +
  geom_boxplot() +
  labs(title = "Log GDP per Capita by Region",
       y = "Log GDP per Capita", x = "Region") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Scatterplots with Linear Trend

# Generate scatterplots for each economic freedom variable against log GDP per capita
vars_to_plot <- c("EFW_Index", "Gov_Consumption", "Gov_Size", "Property_Rights",
                  "Sound_Money", "Freedom_to_Trade", "Regulation")

plots <- lapply(vars_to_plot, function(var) {
  ggplot(Combined_Economic_Data, aes_string(x = var, y = "log_gdppc")) +
    geom_point(alpha = 0.5, color = "steelblue") +
    geom_smooth(method = "lm", color = "darkred", se = FALSE) +
    labs(title = paste(var, "vs Log GDP per Capita"),
         x = var, y = "Log GDP per Capita") +
    theme_minimal()
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Arrange scatterplots into a multi-panel layout
cowplot::plot_grid(plotlist = plots, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

4. Model: First Differenced Linear Regression

# 1) Make sure types are right and create the log
Combined_Economic_Data <- Combined_Economic_Data %>%
  mutate(
    region  = factor(region),
    year    = as.integer(year),
    gdppc   = as.numeric(gdppc),              # ensure numeric
    log_gdppc = if (!"log_gdppc" %in% names(.)) log(gdppc) else log_gdppc
  )

# 2) Compute lag and first difference of log(gdppc), *within country* and *ordered by year*
Combined_Economic_Data <- Combined_Economic_Data %>%
  group_by(country) %>%
  mutate(
    lag_log_gdppc = dplyr::lag(log_gdppc, order_by = year),
    d_log_gdppc   = log_gdppc - lag_log_gdppc
  ) %>%
  ungroup()

# 3) Select model variables and drop rows that are missing ONLY because of differencing or missing covariates
model_vars <- c(
  "d_log_gdppc","lag_log_gdppc",
  "Gov_Consumption","Gov_Size","Property_Rights",
  "Sound_Money","Freedom_to_Trade","Regulation","region"
)

# sanity check: warn if any requested columns are absent
missing_cols <- setdiff(model_vars, names(Combined_Economic_Data))
if (length(missing_cols)) {
  stop(sprintf("These columns are missing from Combined_Economic_Data: %s",
               paste(missing_cols, collapse = ", ")))
}

model_data <- Combined_Economic_Data %>%
  select(all_of(model_vars)) %>%
  drop_na(d_log_gdppc, lag_log_gdppc) %>%  # required by construction
  drop_na(Gov_Consumption, Gov_Size, Property_Rights,
          Sound_Money, Freedom_to_Trade, Regulation, region) %>%
  droplevels()

# 4) Fit the model
model_diff <- lm(
  d_log_gdppc ~ lag_log_gdppc + Gov_Consumption + Gov_Size +
    Property_Rights + Sound_Money + Freedom_to_Trade +
    Regulation + region,
  data = model_data
)

summary(model_diff)
## 
## Call:
## lm(formula = d_log_gdppc ~ lag_log_gdppc + Gov_Consumption + 
##     Gov_Size + Property_Rights + Sound_Money + Freedom_to_Trade + 
##     Regulation + region, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51593 -0.02990 -0.00555  0.02126  0.97646 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         0.2387751  0.0256797   9.298  < 2e-16 ***
## lag_log_gdppc                      -0.0258917  0.0026410  -9.804  < 2e-16 ***
## Gov_Consumption                     0.0007519  0.0010703   0.703 0.482385    
## Gov_Size                           -0.0062771  0.0017731  -3.540 0.000406 ***
## Property_Rights                     0.0067528  0.0017965   3.759 0.000174 ***
## Sound_Money                        -0.0050678  0.0011972  -4.233 2.37e-05 ***
## Freedom_to_Trade                    0.0058385  0.0015629   3.736 0.000191 ***
## Regulation                          0.0095325  0.0022169   4.300 1.76e-05 ***
## regionEastern Europe               -0.0174305  0.0114727  -1.519 0.128786    
## regionLatin America                -0.0299045  0.0117459  -2.546 0.010945 *  
## regionMiddle East and North Africa -0.0094531  0.0121065  -0.781 0.434961    
## regionSouth and South East Asia    -0.0042981  0.0121264  -0.354 0.723034    
## regionSub Saharan Africa           -0.0629882  0.0119281  -5.281 1.37e-07 ***
## regionWestern Europe               -0.0246342  0.0113354  -2.173 0.029839 *  
## regionWestern Offshoots            -0.0379744  0.0134681  -2.820 0.004839 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07801 on 3168 degrees of freedom
## Multiple R-squared:  0.07164,    Adjusted R-squared:  0.06753 
## F-statistic: 17.46 on 14 and 3168 DF,  p-value: < 2.2e-16
vif(model_diff)
##                      GVIF Df GVIF^(1/(2*Df))
## lag_log_gdppc    5.432823  1        2.330842
## Gov_Consumption  3.187520  1        1.785363
## Gov_Size         2.419598  1        1.555506
## Property_Rights  6.126253  1        2.475127
## Sound_Money      2.256757  1        1.502251
## Freedom_to_Trade 3.255542  1        1.804312
## Regulation       3.906374  1        1.976455
## region           8.655045  7        1.166669

5. Residual Analysis

# Plot diagnostics: residuals vs fitted, scale-location, and normal Q-Q
plot(model_diff, which = 1)  # Residuals vs Fitted

plot(model_diff, which = 3)  # Scale-Location

plot(model_diff, which = 2)  # Normal Q-Q

# Residual distribution density vs normal distribution
residuals_df <- data.frame(resid = residuals(model_diff))

ggplot(residuals_df, aes(x = resid)) +
  geom_density(fill = "lightblue", color = "black") +
  stat_function(fun = dnorm,
                args = list(mean = mean(residuals_df$resid), 
                            sd = sd(residuals_df$resid)),
                color = "red", linetype = "dashed") +
  labs(title = "Density Plot of Model Residuals",
       x = "Residuals", y = "Density") +
  theme_minimal()

6. Conclusion

This analysis includes a lagged GDP per capita variable, which helps control for past economic conditions when modeling changes in economic performance. The inclusion of this lag term improves model interpretation by accounting for baseline economic levels.

The results indicate that government consumption continues to show a positive and statistically significant association with changes in log GDP per capita, even after controlling for the previous year’s GDP. Several regional effects remain significant, reinforcing the idea that geography and local context play a key role in economic development.

the R-squared is low (~6.8%), this is not unusual for macro-level panel data covering many countries. The low explanatory power reflects the complexity of economic growth, which is influenced by many unmeasured factors.

The variance inflation factors suggest some multicolinearity is present in the explanatory variables, but the problem is not severe.

Diagnostic plots suggest some non-normality and heteroscedasticity in the residuals, so caution is warranted when interpreting coefficients.

Future improvements could include: - Using fixed or random effects models for better control over unobserved heterogeneity - Lagging additional predictors to explore delayed policy effects - Employing robust standard errors to adjust for heteroscedasticity