R Markdown
Importing GHED Data
ghed_raw <- read_xlsx("GHED_data.xlsx", sheet = "Data") %>%
select(country, code, year,
gghed_usd, gghed_usd2021, gghed_usd2021_pc,
gdp_usd2021_pc, gghed_gdp, gge_gdp) %>%
mutate(est_type = "observed")
# View the resulting data frame
#print(ghed_raw)
## Add years to 2030 and convert to long format
temp <- expand.grid(unique(ghed_raw$code), seq(2000, 2030, 1)) %>%
rename("code" = "Var1", "year" = "Var2") %>%
left_join(ghed_raw, by = c("code", "year")) %>%
#convert to long format
pivot_longer(cols = starts_with("gghed"),
names_to = "ind_name", values_to = "ind_value")
ghed_long <- ghed_raw %>%
pivot_longer(cols = starts_with("gghed"),
names_to = "ind_name", values_to = "ind_value")
#Regression using Panel data for beginners
#print(ghed_long)
Exclude certain countries and years and take logs
ghed_filtered <- ghed_raw %>%
mutate(
log_gdp_pc = log(gdp_usd2021_pc),
log_gghed_pc = log(gghed_usd2021_pc),
log_gge_gdp = log(gge_gdp),
contract = ifelse((log_gdp_pc-dplyr::lag(log_gdp_pc))<0,1,0)
) %>%
filter(!(country %in% c("Cuba", "South Sudan", "Venezuela", "Yemen",
"Zimbabwe"))) %>%
filter(!(year %in% c("2020", "2021", "2022"))) %>%
na.omit() # Remove any rows with NA values
model1 <- plm(log_gghed_pc ~ log_gdp_pc, data = ghed_filtered,index =
c("country", "year"), model = "within")
model2 <- plm(log_gghed_pc ~ log_gdp_pc + contract +log_gdp_pc*contract,
data = ghed_filtered,index = c("country", "year"), model = "within")
model3 <- plm(log_gghed_pc ~ log_gdp_pc + contract + log_gdp_pc*contract +
log_gge_gdp + log_gge_gdp*contract,data = ghed_filtered,
index = c("country", "year"), model = "within")
# Use stargazer to display the model summary
stargazer(model1, model2, model3, type="text",align = TRUE)
##
## ========================================================================================================
## Dependent variable:
## -----------------------------------------------------------------------------------
## log_gghed_pc
## (1) (2) (3)
## --------------------------------------------------------------------------------------------------------
## log_gdp_pc 1.269*** 1.276*** 1.161***
## (0.022) (0.022) (0.020)
##
## contract -0.134** -0.064
## (0.057) (0.066)
##
## log_gge_gdp 0.617***
## (0.023)
##
## log_gdp_pc:contract 0.021*** -0.005
## (0.006) (0.006)
##
## contract:log_gge_gdp 0.041**
## (0.019)
##
## --------------------------------------------------------------------------------------------------------
## Observations 3,698 3,698 3,698
## R2 0.493 0.498 0.593
## Adjusted R2 0.466 0.471 0.570
## F Statistic 3,415.132*** (df = 1; 3509) 1,159.300*** (df = 3; 3507) 1,019.727*** (df = 5; 3505)
## ========================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Make Predictions for 2020-2022
# Prepare the new data for the years 2020, 2021, and 2022
ghed_new_data <- ghed_raw %>%
mutate(
log_gdp_pc = log(gdp_usd2021_pc),
log_gghed_pc = log(gghed_usd2021_pc),
log_gge_gdp = log(gge_gdp),
contract = ifelse((log_gdp_pc - dplyr::lag(log_gdp_pc)) < 0, 1, 0)
) %>%
filter(country %in% c("South Africa", "Kenya", "Namibia")) %>%
filter(year %in% c("2020", "2021", "2022")) %>%
na.omit() # Remove any rows with NA values
# Make predictions with all three models
predictions_model1 <- predict(model1, newdata = ghed_new_data)
## Warning in predict.plm(model1, newdata = ghed_new_data): Data supplied in
## argument 'newdata' is not a pdata.frame; weighted mean of fixed effects as in
## original model used for prediction, see ?predict.plm.
predictions_model2 <- predict(model2, newdata = ghed_new_data)
## Warning in predict.plm(model2, newdata = ghed_new_data): Data supplied in
## argument 'newdata' is not a pdata.frame; weighted mean of fixed effects as in
## original model used for prediction, see ?predict.plm.
predictions_model3 <- predict(model3, newdata = ghed_new_data)
## Warning in predict.plm(model3, newdata = ghed_new_data): Data supplied in
## argument 'newdata' is not a pdata.frame; weighted mean of fixed effects as in
## original model used for prediction, see ?predict.plm.
# Combine predictions with ghed_new_data for easy interpretation
results <- ghed_new_data %>%
mutate(pred_model1 = predictions_model1,
pred_model2 = predictions_model2,
pred_model3 = predictions_model3)
# Visualize the results
ggplot(results, aes(x = as.factor(year), group = country)) +
geom_line(aes(y = log_gghed_pc, color = "Actual")) +
geom_line(aes(y = pred_model1, color = "Model 1 Prediction"), linetype = "dashed") +
geom_line(aes(y = pred_model2, color = "Model 2 Prediction"), linetype = "dotted") +
geom_line(aes(y = pred_model3, color = "Model 3 Prediction"), linetype = "twodash") +
facet_wrap(~country, scales = "free_y") +
labs(title = "Predictions vs Actual Log GGHE per Capita",
x = "Year",
y = "Log GGHE per Capita",
color = "Legend") +
theme_minimal()

Make predictions for South Africa and plot against orginal
# Create new data for South Africa for the years 2020-2029
sa_data <- data.frame(log_gdp_pc = rep(mean(ghed_filtered$log_gdp_pc), 10),
year = 2020:2029,
contract = 0,
log_gge_gdp = rep(mean(ghed_filtered$log_gge_gdp), 10))
# Make predictions using each model
sa_data$model1_pred <- predict(model1, newdata = sa_data)
## Warning in predict.plm(model1, newdata = sa_data): Data supplied in argument
## 'newdata' is not a pdata.frame; weighted mean of fixed effects as in original
## model used for prediction, see ?predict.plm.
sa_data$model2_pred <- predict(model2, newdata = sa_data)
## Warning in predict.plm(model2, newdata = sa_data): Data supplied in argument
## 'newdata' is not a pdata.frame; weighted mean of fixed effects as in original
## model used for prediction, see ?predict.plm.
sa_data$model3_pred <- predict(model3, newdata = sa_data)
## Warning in predict.plm(model3, newdata = sa_data): Data supplied in argument
## 'newdata' is not a pdata.frame; weighted mean of fixed effects as in original
## model used for prediction, see ?predict.plm.
# Plotting
sa_original <- ghed_filtered %>% filter(country == "South Africa")
ggplot() +
geom_line(data = sa_original, aes(x = year, y = exp(log_gghed_pc), color = "Original Data"), linewidth = 1) +
geom_line(data = sa_data, aes(x = year, y = exp(model1_pred), color = "Model 1 Prediction"), linetype = "dashed", linewidth = 1) +
geom_line(data = sa_data, aes(x = year, y = exp(model2_pred), color = "Model 2 Prediction"), linetype = "dashed", linewidth = 1) +
geom_line(data = sa_data, aes(x = year, y = exp(model3_pred), color = "Model 3 Prediction"), linetype = "dashed", linewidth = 1) +
labs(title = "Comparison of Predictions for South Africa (2020-2029)",
x = "Year",
y = "GGHED per capita",
color = "Data/Model") +
scale_color_manual(values = c("Original Data" = "blue", "Model 1 Prediction" = "green", "Model 2 Prediction" = "red", "Model 3 Prediction" = "purple")) +
theme_minimal()
