# Preview a few rows to understand the structure
head(Maddison_Economic_Data)
head(Fraiser_Data)
# Filter Maddison data to relevant time period and drop missing entries
Maddison_Economic_Data_Cleaned <- Maddison_Economic_Data %>%
filter(year >= 1970, year <= 2022) %>%
na.omit() %>%
mutate(year = as.integer(year))
# Select relevant columns from Fraser data and rename for clarity
Fraiser_Data_Cleaned <- Fraiser_Data[, c(1,3,4,5,8,20,32,41,59,82)]
colnames(Fraiser_Data_Cleaned) <- c("year", "ISO_Code_3", "country", "EFW_Index",
"Gov_Consumption", "Gov_Size", "Property_Rights",
"Sound_Money", "Freedom_to_Trade", "Regulation")
# Remove rows with missing values and ensure matching types for join
Fraiser_Data_Cleaned <- Fraiser_Data_Cleaned %>%
na.omit() %>%
mutate(year = as.integer(year))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `year = as.integer(year)`.
## Caused by warning:
## ! NAs introduced by coercion
# Combine datasets on shared keys: country and year
Combined_Economic_Data <- inner_join(
Maddison_Economic_Data_Cleaned,
Fraiser_Data_Cleaned,
by = c("country", "year")
) %>%
mutate(log_gdppc = log(gdppc)) # log-transform GDP per capita
# Get detailed summary statistics without visual charts
skim_without_charts(Combined_Economic_Data)
| Name | Combined_Economic_Data |
| Number of rows | 3316 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| countrycode | 0 | 1 | 3 | 3 | 0 | 133 | 0 |
| country | 0 | 1 | 4 | 24 | 0 | 133 | 0 |
| region | 0 | 1 | 9 | 28 | 0 | 8 | 0 |
| ISO_Code_3 | 0 | 1 | 3 | 3 | 0 | 133 | 0 |
| EFW_Index | 0 | 1 | 1 | 18 | 0 | 523 | 0 |
| Gov_Consumption | 0 | 1 | 1 | 21 | 0 | 2506 | 0 |
| Gov_Size | 0 | 1 | 4 | 18 | 0 | 3296 | 0 |
| Property_Rights | 0 | 1 | 15 | 18 | 0 | 3314 | 0 |
| Sound_Money | 0 | 1 | 1 | 21 | 0 | 3293 | 0 |
| Freedom_to_Trade | 0 | 1 | 1 | 19 | 0 | 3259 | 0 |
| Regulation | 0 | 1 | 3 | 18 | 0 | 3261 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1 | 2007.32 | 12.03 | 1970.00 | 2003.00 | 2010.00 | 2016.00 | 2022.00 |
| gdppc | 0 | 1 | 17182.89 | 18516.74 | 561.00 | 3500.97 | 10996.65 | 24969.04 | 160051.24 |
| pop | 0 | 1 | 48022.57 | 160258.01 | 91.00 | 4046.46 | 10106.57 | 29390.11 | 1402536.74 |
| log_gdppc | 0 | 1 | 9.14 | 1.22 | 6.33 | 8.16 | 9.31 | 10.13 | 11.98 |
# Basic descriptive stats for selected variables
Combined_Economic_Data %>%
select(year, country, region, gdppc, log_gdppc, EFW_Index, pop) %>%
summary()
## year country region gdppc
## Min. :1970 Length:3316 Length:3316 Min. : 561
## 1st Qu.:2003 Class :character Class :character 1st Qu.: 3501
## Median :2010 Mode :character Mode :character Median : 10997
## Mean :2007 Mean : 17183
## 3rd Qu.:2016 3rd Qu.: 24969
## Max. :2022 Max. :160051
## log_gdppc EFW_Index pop
## Min. : 6.330 Length:3316 Min. : 91
## 1st Qu.: 8.161 Class :character 1st Qu.: 4046
## Median : 9.305 Mode :character Median : 10107
## Mean : 9.140 Mean : 48023
## 3rd Qu.:10.125 3rd Qu.: 29390
## Max. :11.983 Max. :1402537
# Convert Fraser indicators to numeric to ensure compatibility in modeling
Combined_Economic_Data <- Combined_Economic_Data %>%
mutate(across(c(EFW_Index, Gov_Consumption, Gov_Size, Property_Rights,
Sound_Money, Freedom_to_Trade, Regulation), as.numeric))
# Create heatmap to visualize pairwise correlations between economic indicators
cor_vars <- Combined_Economic_Data %>%
select(log_gdppc, EFW_Index, Gov_Consumption, Gov_Size,
Property_Rights, Sound_Money, Freedom_to_Trade, Regulation)
cor_matrix <- cor(cor_vars, use = "complete.obs")
cor_long <- melt(cor_matrix)
# Plot heatmap with correlation values
ggplot(cor_long, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = "Correlation") +
theme_minimal() +
labs(title = "Correlation Heatmap of Economic Indicators") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
axis.title = element_blank())
# Visualize distribution of GDP per capita (log) across geographic regions
ggplot(Combined_Economic_Data, aes(x = region, y = log_gdppc, fill = region)) +
geom_boxplot() +
labs(title = "Log GDP per Capita by Region",
y = "Log GDP per Capita", x = "Region") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Generate scatterplots for each economic freedom variable against log GDP per capita
vars_to_plot <- c("EFW_Index", "Gov_Consumption", "Gov_Size", "Property_Rights",
"Sound_Money", "Freedom_to_Trade", "Regulation")
plots <- lapply(vars_to_plot, function(var) {
ggplot(Combined_Economic_Data, aes_string(x = var, y = "log_gdppc")) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_smooth(method = "lm", color = "darkred", se = FALSE) +
labs(title = paste(var, "vs Log GDP per Capita"),
x = var, y = "Log GDP per Capita") +
theme_minimal()
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Arrange scatterplots into a multi-panel layout
cowplot::plot_grid(plotlist = plots, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# 1) Make sure types are right and create the log
Combined_Economic_Data <- Combined_Economic_Data %>%
mutate(
region = factor(region),
year = as.integer(year),
gdppc = as.numeric(gdppc), # ensure numeric
log_gdppc = if (!"log_gdppc" %in% names(.)) log(gdppc) else log_gdppc
)
# 2) Compute lag and first difference of log(gdppc), *within country* and *ordered by year*
Combined_Economic_Data <- Combined_Economic_Data %>%
group_by(country) %>%
mutate(
lag_log_gdppc = dplyr::lag(log_gdppc, order_by = year),
d_log_gdppc = log_gdppc - lag_log_gdppc
) %>%
ungroup()
# 3) Select model variables and drop rows that are missing ONLY because of differencing or missing covariates
model_vars <- c(
"d_log_gdppc","lag_log_gdppc",
"Gov_Consumption","Gov_Size","Property_Rights",
"Sound_Money","Freedom_to_Trade","Regulation","region"
)
# sanity check: warn if any requested columns are absent
missing_cols <- setdiff(model_vars, names(Combined_Economic_Data))
if (length(missing_cols)) {
stop(sprintf("These columns are missing from Combined_Economic_Data: %s",
paste(missing_cols, collapse = ", ")))
}
model_data <- Combined_Economic_Data %>%
select(all_of(model_vars)) %>%
drop_na(d_log_gdppc, lag_log_gdppc) %>% # required by construction
drop_na(Gov_Consumption, Gov_Size, Property_Rights,
Sound_Money, Freedom_to_Trade, Regulation, region) %>%
droplevels()
# 4) Fit the model
model_diff <- lm(
d_log_gdppc ~ lag_log_gdppc + Gov_Consumption + Gov_Size +
Property_Rights + Sound_Money + Freedom_to_Trade +
Regulation + region,
data = model_data
)
summary(model_diff)
##
## Call:
## lm(formula = d_log_gdppc ~ lag_log_gdppc + Gov_Consumption +
## Gov_Size + Property_Rights + Sound_Money + Freedom_to_Trade +
## Regulation + region, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.51593 -0.02990 -0.00555 0.02126 0.97646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2387751 0.0256797 9.298 < 2e-16 ***
## lag_log_gdppc -0.0258917 0.0026410 -9.804 < 2e-16 ***
## Gov_Consumption 0.0007519 0.0010703 0.703 0.482385
## Gov_Size -0.0062771 0.0017731 -3.540 0.000406 ***
## Property_Rights 0.0067528 0.0017965 3.759 0.000174 ***
## Sound_Money -0.0050678 0.0011972 -4.233 2.37e-05 ***
## Freedom_to_Trade 0.0058385 0.0015629 3.736 0.000191 ***
## Regulation 0.0095325 0.0022169 4.300 1.76e-05 ***
## regionEastern Europe -0.0174305 0.0114727 -1.519 0.128786
## regionLatin America -0.0299045 0.0117459 -2.546 0.010945 *
## regionMiddle East and North Africa -0.0094531 0.0121065 -0.781 0.434961
## regionSouth and South East Asia -0.0042981 0.0121264 -0.354 0.723034
## regionSub Saharan Africa -0.0629882 0.0119281 -5.281 1.37e-07 ***
## regionWestern Europe -0.0246342 0.0113354 -2.173 0.029839 *
## regionWestern Offshoots -0.0379744 0.0134681 -2.820 0.004839 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07801 on 3168 degrees of freedom
## Multiple R-squared: 0.07164, Adjusted R-squared: 0.06753
## F-statistic: 17.46 on 14 and 3168 DF, p-value: < 2.2e-16
vif(model_diff)
## GVIF Df GVIF^(1/(2*Df))
## lag_log_gdppc 5.432823 1 2.330842
## Gov_Consumption 3.187520 1 1.785363
## Gov_Size 2.419598 1 1.555506
## Property_Rights 6.126253 1 2.475127
## Sound_Money 2.256757 1 1.502251
## Freedom_to_Trade 3.255542 1 1.804312
## Regulation 3.906374 1 1.976455
## region 8.655045 7 1.166669
# Plot diagnostics: residuals vs fitted, scale-location, and normal Q-Q
plot(model_diff, which = 1) # Residuals vs Fitted
plot(model_diff, which = 3) # Scale-Location
plot(model_diff, which = 2) # Normal Q-Q
# Residual distribution density vs normal distribution
residuals_df <- data.frame(resid = residuals(model_diff))
ggplot(residuals_df, aes(x = resid)) +
geom_density(fill = "lightblue", color = "black") +
stat_function(fun = dnorm,
args = list(mean = mean(residuals_df$resid),
sd = sd(residuals_df$resid)),
color = "red", linetype = "dashed") +
labs(title = "Density Plot of Model Residuals",
x = "Residuals", y = "Density") +
theme_minimal()
This analysis includes a lagged GDP per capita variable, which helps control for past economic conditions when modeling changes in economic performance. The inclusion of this lag term improves model interpretation by accounting for baseline economic levels.
The results indicate that government consumption continues to show a positive and statistically significant association with changes in log GDP per capita, even after controlling for the previous year’s GDP. Several regional effects remain significant, reinforcing the idea that geography and local context play a key role in economic development.
the R-squared is low (~6.8%), this is not unusual for macro-level panel data covering many countries. The low explanatory power reflects the complexity of economic growth, which is influenced by many unmeasured factors.
The variance inflation factors suggest some multicolinearity is present in the explanatory variables, but the problem is not severe.
Diagnostic plots suggest some non-normality and heteroscedasticity in the residuals, so caution is warranted when interpreting coefficients.
Future improvements could include: - Using fixed or random effects models for better control over unobserved heterogeneity - Lagging additional predictors to explore delayed policy effects - Employing robust standard errors to adjust for heteroscedasticity