library(readxl)
library(writexl)
library(tidyverse)
library(labelled)
library(ggcorrplot)
library(gridExtra)
library(GGally)
library(knitr)
library(kableExtra)
require(psych)
library(GPArotation)
library(Hmisc)
library(visdat)
require(lavaan)
require(lavaanPlot)
require(ResourceSelection)
require(semPlot)
require(tidySEM)
require(performance)
require(car)
library(semPlot)From Equal Weights to Empirical Weights: Reassessing Economic Freedom Rankings
1 Paper’s Angle
Why equal weighting might be problematic:
The areas aren’t necessarily of equal importance to economic outcomes or freedom. For instance, having sound property rights and rule of law (Area 2) could be argued as more fundamental than the exact size of government (Area 1). A country could theoretically have a larger government that still respects contracts and property rights. Also, the components within areas vary significantly in their impact. Sound Money has only 4 components while Regulation has many more nested subcomponents, yet each area still gets 20%. Third, different development contexts might require different weightings. What matters most for economic freedom in a developing economy might differ from an advanced economy. We do not address this issue though.
However, there are counterarguments for equal weighting:
It avoids subjective judgments about what’s “more important,” which could introduce ideological bias. Equal weighting is at least transparent and defensible as a neutral starting point. Empirically, one could test whether different weighting schemes would materially change country rankings or correlations with outcomes. If the rankings are robust to different weighting approaches, then the equal weighting critique may be less concerning in practice. The simplicity of equal weights can also make the index more accessible and easier to understand for policymakers and the public
Terminology used in this notebook follows the EFW hierarchy: Area, Component, and Subcomponent. When we refer to variables used in EFA/CFA, we report names and codes (for example, X1A, X1D_i).
2 Setup
2.1 Libraries
Minimum set of libraries
2.2 Functions
Correlation, printing, and citing
corplot <- function(d, m = "pearson", sz = 2.5, title = NULL, hc_order = FALSE) {
d_num <- d %>% dplyr::select(where(is.numeric))
if (ncol(d_num) < 2) return(NULL)
mycorr <- cor(d_num, use = "pairwise.complete.obs", method = m)
p.mat <- ggcorrplot::cor_pmat(d_num)
ggcorrplot(mycorr,
hc.order = hc_order,
type = "lower",
colors = c("red","white","green"),
tl.cex = 8,
tl.col = "black",
lab = TRUE,
lab_size = sz,
p.mat = p.mat,
sig.level = 0.01,
insig = "pch",
pch = 4) +
ggplot2::ggtitle(title %||% "")
}
myprint=function(x){x%>%kbl()%>%kable_classic(html_font = "Cambria")}2.3 Variable Labels (Paper Outputs)
# Single source of truth for Component/Subcomponent labels used in tables/figures/exports.
var_label_map <- c(
X1A = "Government consumption",
X1B = "Transfers and subsidies",
X1C = "Government investment",
X1D_i = "Top marginal income tax rate",
X1D_ii = "Top marginal income and payroll tax rate",
X1E = "State ownership of assets",
X2A_scaled = "Judicial independence*",
X2B_scaled = "Impartial courts*",
X2C_scaled = "Property rights*",
X2D_scaled = "Military interference*",
X2E_scaled = "Legal integrity*",
X2F_scaled = "Contract enforcement*",
X2G_scaled = "Real property restrictions*",
X2H_scaled = "Police reliability*",
X3A = "Money growth",
X3B = "Inflation volatility",
X3C = "Recent inflation",
X3D = "Foreign currency accounts",
X4A_i = "Trade tax revenue",
X4A_ii = "Mean tariff rate",
X4A_iii = "Tariff dispersion",
X4B_i = "Non-tariff barriers",
X4B_ii = "Trade compliance costs",
X4C = "Black market exchange rates",
X4D_i = "Financial openness",
X4D_ii = "Capital controls",
X4D_iii = "Freedom of foreigners to visit",
X4D_iv = "Protection of foreign assets",
X5A_i = "Ownership of banks",
X5A_ii = "Private sector credit",
X5A_iii = "Interest rate controls",
X5B_i = "Hiring regulations and minimum wage",
X5B_ii = "Hiring and firing regulations",
X5B_iii = "Centralized collective bargaining",
X5B_iv = "Hours regulations",
X5B_v = "Mandated cost of worker dismissal",
X5B_vi = "Conscription",
X5B_vii = "Foreign labor",
X5C_i = "Regulatory burden",
X5C_ii = "Bureaucracy costs",
X5C_iii = "Impartial public administration",
X5C_iv = "Tax compliance",
X5D_i = "Market openness",
X5D_ii = "Business permits",
X5D_iii = "Distortion of business environment"
)3 Data
Will have to fix the imported data, relabel vars, add value labels, and export the cleaned data out. Will filter to 2022 data, though many more years of data is available. 2023 data just got released.
# Import Data
raw_data <- read_excel("../data/raw/economic-freedom-of-the-world-2024-master-index-data-for-researchers-iso.xlsx",
sheet = "EFW Data 2024 Report",
range = "B5:CH4790"
)New names:
• `data` -> `data...9`
• `data` -> `data...11`
• `data` -> `data...13`
• `data` -> `data...15`
• `data` -> `data...17`
• `data` -> `data...35`
• `data` -> `data...37`
• `data` -> `data...39`
• `data` -> `data...44`
• `data` -> `data...46`
• `data` -> `data...48`
df <- raw_data
#names(df)
#str(df)4 Data Cleaning: Fix imported var names and add Labels
4.1 Area 1: Size of Government
1A: Government consumption
1B: Transfers and subsidies
1C: Government investment
1D: Top marginal tax rate
i. 1Di: Top marginal income tax rate
ii. 1Dii: Top marginal income and payroll tax rates
1E: State ownership of assets
# --- Rename only Area 1 columns ---
df <- df %>%
rename(
`1A` = `Government consumption`, # score
`1A_data` = `data...9`, # underlying data
`1B` = `Transfers and subsidies`, # score
`1B_data` = `data...11`, # underlying data
`1C` = `Government investment`, # score
`1C_data` = `data...13`, # underlying data
`1D_i` = `Top marginal income tax rate`, # score
`1D_i_data` = `data...15`, # underlying data
`1D_ii` = `Top marginal income and payroll tax rate`, # score
`1D_ii_data` = `data...17`, # underlying data
`1D` = `Top marginal tax rate`, # combined top rate
`1E` = `State ownership of Assets`, # score
`Area1` = `Size of Government`, # Area 1 summary score
`Area1_rank` = `Area 1 Rank` # Area 1 rank
)
# --- Attach labels so you keep the readable names ---
var_label(df$`1A`) <- "Government consumption"
var_label(df$`1A_data`) <- "Government consumption – data"
var_label(df$`1B`) <- "Transfers and subsidies"
var_label(df$`1B_data`) <- "Transfers and subsidies – data"
var_label(df$`1C`) <- "Government investment"
var_label(df$`1C_data`) <- "Government investment – data"
var_label(df$`1D_i`) <- "Top marginal income tax rate"
var_label(df$`1D_i_data`) <- "Top marginal income tax rate – data"
var_label(df$`1D_ii`) <- "Top marginal income and payroll tax rate"
var_label(df$`1D_ii_data`) <- "Top marginal income and payroll tax rate – data"
var_label(df$`1D`) <- "Top marginal tax rate (aggregate score)"
var_label(df$`1E`) <- "State ownership of Assets"
var_label(df$Area1) <- "Area 1: Size of Government (summary score)"
var_label(df$Area1_rank) <- "Area 1 Rank"
#dplyr::glimpse(df) # see new names
#sapply(df[c("1A","1B","1C","1D","1E","Area1")], var_label) # see labels4.2 Area 2: Legal System and Property Rights
2A–2H cover the eight Subcomponents.
2_adj, 2_wo_gen, 2_with_gen capture the gender adjustment and the two alternate summary indices.
Area2 = the official “with gender adjustment” score.
df <- df %>%
rename(
`2A` = `Judicial independence`,
`2B` = `Impartial courts`,
`2C` = `Protection of property rights`,
`2D` = `Military interference in rule of law and politics`,
`2E` = `Integrity of the legal system`,
`2F` = `Legal enforcement of contracts`,
`2G` = `Regulatory restrictions on the sale of real property`,
`2H` = `Reliability of police`,
`2_adj` = `Gender Legal Rights Adjustment`,
`2_wo_gen` = `Legal System & Property Rights -- Without Gender Adjustment`,
`2_with_gen` = `Legal System & Property Rights -- With Gender Adjustment`,
Area2_rank = `Area 2 Rank`
) %>%
mutate(
Area2 = `2_with_gen`
) %>%
relocate(Area2, .after = `2_with_gen`)
# Labels
var_label(df$`2A`) <- "Judicial independence"
var_label(df$`2B`) <- "Impartial courts"
var_label(df$`2C`) <- "Protection of property rights"
var_label(df$`2D`) <- "Military interference in rule of law and politics"
var_label(df$`2E`) <- "Integrity of the legal system"
var_label(df$`2F`) <- "Legal enforcement of contracts"
var_label(df$`2G`) <- "Regulatory restrictions on the sale of real property"
var_label(df$`2H`) <- "Reliability of police"
var_label(df$`2_adj`) <- "Gender Legal Rights Adjustment"
var_label(df$`2_wo_gen`) <- "Legal System & Property Rights (without Gender Adjustment)"
var_label(df$`2_with_gen`) <- "Legal System & Property Rights (with Gender Adjustment)"
var_label(df$Area2) <- "Area 2: Legal System & Property Rights (summary score)"
var_label(df$Area2_rank) <- "Area 2: Rank"4.3 Create the Scaled Variables for Area 2
#str(df)
# Compute scaling factor
scaling_factor <- (1 + df$`2_adj`) / 2
# Create new scaled variables and assign descriptive names
df$`2A_scaled` <- df$`2A` * scaling_factor
label(df$`2A_scaled`) <- "Judicial independence*"
df$`2B_scaled` <- df$`2B` * scaling_factor
label(df$`2B_scaled`) <- "Impartial courts*"
df$`2C_scaled` <- df$`2C` * scaling_factor
label(df$`2C_scaled`) <- "Protection of property rights*"
df$`2D_scaled` <- df$`2D` * scaling_factor
label(df$`2D_scaled`) <- "Military interference in rule of law and politics*"
df$`2E_scaled` <- df$`2E` * scaling_factor
label(df$`2E_scaled`) <- "Integrity of the legal system *"
df$`2F_scaled` <- df$`2F` * scaling_factor
label(df$`2F_scaled`) <- "Legal enforcement of contracts*"
df$`2G_scaled` <- df$`2G` * scaling_factor
label(df$`2G_scaled`) <- "Regulatory restrictions on property*"
df$`2H_scaled` <- df$`2H` * scaling_factor
label(df$`2H_scaled`) <- "Reliability of police*"4.4 Area 3: Sound Money
df <- df %>%
rename(
`3A` = `Money growth`,
`3A_data` = `data...35`,
`3B` = `Standard deviation of inflation`,
`3B_data` = `data...37`,
`3C` = `Inflation: Most recent year`,
`3C_data` = `data...39`,
`3D` = `Freedom to own foreign currency bank accounts`,
Area3 = `Sound Money`,
Area3_rank = `Area 3 Rank`
)
var_label(df$`3A`) <- "Money growth"
var_label(df$`3A_data`) <- "Money growth – data"
var_label(df$`3B`) <- "Standard deviation of inflation"
var_label(df$`3B_data`) <- "Standard deviation of inflation – data"
var_label(df$`3C`) <- "Inflation: Most recent year"
var_label(df$`3C_data`) <- "Inflation: Most recent year – data"
var_label(df$`3D`) <- "Freedom to own foreign currency bank accounts"
var_label(df$Area3) <- "Area 3: Sound Money (summary score)"
var_label(df$Area3_rank) <- "Area 3: Rank"4.5 Area 4: Freedom to Trade Internationally
df <- df %>%
rename(
`4A_i` = `Revenue from trade taxes (% of trade sector)`,
`4A_i_data` = `data...44`,
`4A_ii` = `Mean tariff rate`,
`4A_ii_data`= `data...46`,
`4A_iii` = `Standard deviation of tariff rates`,
`4A_iii_data`= `data...48`,
`4A` = `Tariffs`,
`4B_i` = `Non-tariff trade barriers`,
`4B_ii` = `Compliance costs of importing and exporting`,
`4B` = `Regulatory trade barriers`,
`4C` = `Black market exchange rates`,
`4D_i` = `Financial openness`,
`4D_ii` = `Capital controls`,
`4D_iii` = `Freedom of foreigners to visit`,
`4D_iv` = `Protection of foreign assets`,
`4D` = `Controls of the movement of capital and people`,
Area4 = `Freedom to trade internationally`,
Area4_rank = `Area 4 Rank`
)
var_label(df$`4A_i`) <- "Revenue from trade taxes (% of trade sector)"
var_label(df$`4A_i_data`) <- "Revenue from trade taxes – data"
var_label(df$`4A_ii`) <- "Mean tariff rate"
var_label(df$`4A_ii_data`)<- "Mean tariff rate – data"
var_label(df$`4A_iii`) <- "Standard deviation of tariff rates"
var_label(df$`4A_iii_data`)<- "Standard deviation of tariff rates – data"
var_label(df$`4A`) <- "Tariffs (aggregate score)"
var_label(df$`4B_i`) <- "Non-tariff trade barriers"
var_label(df$`4B_ii`) <- "Compliance costs of importing and exporting"
var_label(df$`4B`) <- "Regulatory trade barriers (aggregate score)"
var_label(df$`4C`) <- "Black market exchange rates"
var_label(df$`4D_i`) <- "Financial openness"
var_label(df$`4D_ii`) <- "Capital controls"
var_label(df$`4D_iii`) <- "Freedom of foreigners to visit"
var_label(df$`4D_iv`) <- "Protection of foreign assets"
var_label(df$`4D`) <- "Controls of the movement of capital and people (aggregate score)"
var_label(df$Area4) <- "Area 4: Freedom to trade internationally (summary score)"
var_label(df$Area4_rank) <- "Area 4: Rank"4.6 Area 5: Regulation
df <- df %>%
rename(
`5A_i` = `Ownership of banks`,
`5A_ii` = `Private sector credit`,
`5A_iii` = `Interest rate controls/negative real interest rates)`,
`5A` = `Credit market regulations`,
`5B_i` = `Hiring regulations and minimum wage`,
`5B_ii` = `Hiring and firing regulations`,
`5B_iii` = `Centralized collective bargaining`,
`5B_iv` = `Hours Regulations`,
`5B_v` = `Mandated cost of worker dismissal`,
`5B_vi` = `Conscription`,
`5B_vii` = `Foreign Labor`,
`5B` = `Labor market regulations`,
`5C_i` = `Regulatory Burden`,
`5C_ii` = `Bureacracy costs`,
`5C_iii` = `Impartial Public Administration`,
`5C_iv` = `Tax compliance`,
`5C` = `Business regulations`,
`5D_i` = `Market openness`,
`5D_ii` = `Business Permits`,
`5D_iii` = `Distorton of the business environment`,
`5D` = `Freedom to enter markets and compete`,
Area5 = `Regulation`,
Area5_rank = `Area 5 Rank`
)
var_label(df$`5A_i`) <- "Ownership of banks"
var_label(df$`5A_ii`) <- "Private sector credit"
var_label(df$`5A_iii`) <- "Interest rate controls / negative real interest rates"
var_label(df$`5A`) <- "Credit market regulations (aggregate score)"
var_label(df$`5B_i`) <- "Hiring regulations and minimum wage"
var_label(df$`5B_ii`) <- "Hiring and firing regulations"
var_label(df$`5B_iii`) <- "Centralized collective bargaining"
var_label(df$`5B_iv`) <- "Hours Regulations"
var_label(df$`5B_v`) <- "Mandated cost of worker dismissal"
var_label(df$`5B_vi`) <- "Conscription"
var_label(df$`5B_vii`) <- "Foreign Labor"
var_label(df$`5B`) <- "Labor market regulations (aggregate score)"
var_label(df$`5C_i`) <- "Regulatory Burden"
var_label(df$`5C_ii`) <- "Bureaucracy costs"
var_label(df$`5C_iii`) <- "Impartial Public Administration"
var_label(df$`5C_iv`) <- "Tax compliance"
var_label(df$`5C`) <- "Business regulations (aggregate score)"
var_label(df$`5D_i`) <- "Market openness"
var_label(df$`5D_ii`) <- "Business Permits"
var_label(df$`5D_iii`) <- "Distortion of the business environment"
var_label(df$`5D`) <- "Freedom to enter markets and compete (aggregate score)"
var_label(df$Area5) <- "Area 5: Regulation (summary score)"
var_label(df$Area5_rank) <- "Area 5: Rank"5 Export 2022 Clean Data
df_2022 <- df %>%
filter(Year == 2022) %>%
select(
Year, Countries, `Economic Freedom Summary Index`, Rank,
# Area 1
`1A`, `1B`, `1C`, `1D_i`, `1D_ii`, `1D`, `1E`, Area1, Area1_rank,
# Area 2
`2A`, `2B`, `2C`, `2D`, `2E`, `2F`, `2G`, `2H`, `2_adj`, `2_wo_gen`, `2_with_gen`, Area2, Area2_rank,
# Scaled variables from Area 2
`2A_scaled`, `2B_scaled`, `2C_scaled`, `2D_scaled`, `2E_scaled`, `2F_scaled`, `2G_scaled`, `2H_scaled`,
# Area 3
`3A`, `3B`, `3C`, `3D`, Area3, Area3_rank,
# Area 4
`4A_i`, `4A_ii`, `4A_iii`, `4A`, `4B_i`, `4B_ii`, `4B`, `4C`,
`4D_i`, `4D_ii`, `4D_iii`, `4D_iv`, `4D`, Area4, Area4_rank,
# Area 5
`5A_i`, `5A_ii`, `5A_iii`, `5A`,
`5B_i`, `5B_ii`, `5B_iii`, `5B_iv`, `5B_v`, `5B_vi`, `5B_vii`, `5B`,
`5C_i`, `5C_ii`, `5C_iii`, `5C_iv`, `5C`,
`5D_i`, `5D_ii`, `5D_iii`, `5D`,
Area5, Area5_rank
)
# myprint(describe(mydata))vis_dat(df_2022)# export
write_xlsx(x = df_2022,
path = "../data/clean/economic_freedom_cleaned_2022.xlsx"
)# give me colums that have atleast one missing values
missing_cols <- colnames(df_2022)[colSums(is.na(df_2022)) > 0]
missing_cols [1] "1B" "1C" "1D_ii" "1E" "2D" "2E"
[7] "2G" "2D_scaled" "2E_scaled" "2G_scaled" "3A" "4A_i"
[13] "4A_ii" "4D_ii" "5A_i" "5A_ii" "5A_iii" "5A"
[19] "5B_iii" "5B_v" "5C_i" "5C_iii" "5D_ii"
# rows that have at least one missing value
rows_with_na <- df_2022[!complete.cases(df_2022), ]
rows_with_na# A tibble: 57 × 78
Year Countries Economic Freedom Sum…¹ Rank `1A` `1B` `1C` `1D_i` `1D_ii`
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2022 Bahamas,… 6.65 81 6.81 8.47 10 10 10
2 2022 Barbados 6.81 75 7.77 6.65 10 8 3
3 2022 Belarus 5.98 112 4.70 5.85 10 10 5
4 2022 Belize 6 109 5.6 8.43 1.89 9 NA
5 2022 Benin 6.26 97 7.60 9.77 9.04 5 2
6 2022 Bhutan 6.24 98 3.78 9.46 6.04 7 8
7 2022 Bolivia 6.11 106 5.23 8.84 0 10 10
8 2022 Bosnia a… 6.22 102 5.39 5.96 9.66 10 3
9 2022 Brunei D… 6.99 61 0 NA 10 10 10
10 2022 Burundi 5.22 149 3.09 9.97 8.64 8 5
# ℹ 47 more rows
# ℹ abbreviated name: ¹`Economic Freedom Summary Index`
# ℹ 69 more variables: `1D` <dbl>, `1E` <dbl>, Area1 <dbl>, Area1_rank <dbl>,
# `2A` <dbl>, `2B` <dbl>, `2C` <dbl>, `2D` <dbl>, `2E` <dbl>, `2F` <dbl>,
# `2G` <dbl>, `2H` <dbl>, `2_adj` <dbl>, `2_wo_gen` <dbl>,
# `2_with_gen` <dbl>, Area2 <dbl>, Area2_rank <dbl>, `2A_scaled` <labelled>,
# `2B_scaled` <labelled>, `2C_scaled` <labelled>, `2D_scaled` <labelled>, …
# row indices only
which(!complete.cases(df_2022)) [1] 9 12 13 15 16 17 18 19 22 25 26 27 30 31 35 37 44 50 52
[20] 57 62 63 64 65 73 84 85 88 89 90 98 99 102 103 106 108 112 113
[39] 114 116 119 125 128 132 133 137 141 142 145 146 147 150 151 152 161 162 163
# country + columns missing per row
na_map <- lapply(which(!complete.cases(df_2022)), function(i) {
data.frame(
row_id = i,
Countries = df_2022$Countries[i],
missing_cols = paste(names(df_2022)[is.na(df_2022[i, ])], collapse = ", ")
)
})
do.call(rbind, na_map) row_id Countries
1 9 Bahamas, The
2 12 Barbados
3 13 Belarus
4 15 Belize
5 16 Benin
6 17 Bhutan
7 18 Bolivia
8 19 Bosnia and Herzegovina
9 22 Brunei Darussalam
10 25 Burundi
11 26 Cabo Verde
12 27 Cambodia
13 30 Central African Republic
14 31 Chad
15 35 Comoros
16 37 Congo, Rep.
17 44 Djibouti
18 50 Eswatini
19 52 Fiji
20 57 Georgia
21 62 Guinea
22 63 Guinea-Bissau
23 64 Guyana
24 65 Haiti
25 73 Iraq
26 84 Kyrgyz Republic
27 85 Lao PDR
28 88 Lesotho
29 89 Liberia
30 90 Libya
31 98 Mauritania
32 99 Mauritius
33 102 Mongolia
34 103 Montenegro
35 106 Myanmar
36 108 Nepal
37 112 Niger
38 113 Nigeria
39 114 North Macedonia
40 116 Oman
41 119 Papua New Guinea
42 125 Qatar
43 128 Rwanda
44 132 Seychelles
45 133 Sierra Leone
46 137 Somalia
47 141 Sudan
48 142 Suriname
49 145 Syrian Arab Republic
50 146 Taiwan
51 147 Tajikistan
52 150 Timor-Leste
53 151 Togo
54 152 Trinidad and Tobago
55 161 Venezuela, RB
56 162 Vietnam
57 163 Yemen, Rep.
missing_cols
1 1E, 5B_iii, 5C_i, 5C_iii
2 2D, 2D_scaled
3 5B_iii, 5C_i
4 1D_ii, 1E, 2D, 2E, 2D_scaled, 2E_scaled, 5B_iii, 5C_iii
5 2D, 2D_scaled
6 2D, 2D_scaled, 5B_iii
7 5B_v
8 2D, 2D_scaled
9 1B, 1E, 4A_i, 5A_i, 5C_iii
10 2D, 2D_scaled
11 2D, 2D_scaled
12 2D, 2D_scaled, 5A_i
13 2D, 2D_scaled, 5B_iii, 5C_i
14 2D, 2D_scaled, 4A_i
15 1B, 2D, 2D_scaled, 4A_i, 5A_i, 5B_iii, 5C_i
16 5B_iii
17 1B, 2D, 2D_scaled, 3A, 4A_i, 5A_i, 5A_ii, 5B_iii, 5C_i
18 2D, 2D_scaled
19 2D, 2D_scaled, 5B_iii
20 2D, 2D_scaled
21 1B, 5A_i
22 5B_iii
23 1B, 4A_i, 5B_iii
24 4A_i
25 4A_ii, 5A_i, 5B_iii
26 1C, 2D, 2D_scaled
27 2D, 2D_scaled, 5A_i
28 2D, 2D_scaled
29 5B_iii
30 1B, 2G, 2G_scaled, 4A_i, 5A_i, 5B_iii, 5D_ii
31 1B, 2D, 2D_scaled, 4A_i
32 2D, 2D_scaled
33 5A_i
34 1B, 2D, 2D_scaled
35 5B_iii
36 2D, 2D_scaled
37 5B_iii
38 4A_i
39 2D, 2D_scaled
40 5B_v
41 5B_iii
42 1C
43 2D, 2D_scaled
44 2D, 2D_scaled
45 5B_iii
46 1C, 1D_ii, 3A, 5A_i, 5A_ii, 5A_iii, 5A, 5B_iii, 5C_i, 5D_ii
47 5A_i, 5B_iii
48 5B_iii
49 5A_ii, 5A_iii, 5B_iii, 5C_i, 5D_ii
50 4D_ii
51 2D, 2D_scaled
52 2D, 2G, 2D_scaled, 2G_scaled, 5A_i, 5B_iii
53 1D_ii, 5B_iii
54 1C
55 5B_v
56 1B, 5A_i
57 5A_i, 5D_ii
6 Impute Values
6.1 Heirarchy structure
# ---- 1) Print hierarchy (Area -> Component -> Subcomponent) ----
area_components <- list(
Area1 = c("1A","1B","1C","1D","1E"),
Area2 = c("2A_scaled","2B_scaled","2C_scaled","2D_scaled","2E_scaled","2F_scaled","2G_scaled","2H_scaled"),
Area3 = c("3A","3B","3C","3D"),
Area4 = c("4A","4B","4C","4D"),
Area5 = c("5A","5B","5C","5D")
)
sub_components <- list(
`1D` = c("1D_i","1D_ii"),
`4A` = c("4A_i","4A_ii","4A_iii"),
`4B` = c("4B_i","4B_ii"),
`4D` = c("4D_i","4D_ii","4D_iii","4D_iv"),
`5A` = c("5A_i","5A_ii","5A_iii"),
`5B` = c("5B_i","5B_ii","5B_iii","5B_iv","5B_v","5B_vi","5B_vii"),
`5C` = c("5C_i","5C_ii","5C_iii","5C_iv"),
`5D` = c("5D_i","5D_ii","5D_iii")
)
print_hierarchy <- function(area_components, sub_components) {
for (a in names(area_components)) {
cat(a, "\n", sep = "")
for (comp in area_components[[a]]) {
cat(" - ", comp, "\n", sep = "")
if (comp %in% names(sub_components)) {
for (s in sub_components[[comp]]) cat(" * ", s, "\n", sep = "")
}
}
cat("\n")
}
}
print_hierarchy(area_components, sub_components)Area1
- 1A
- 1B
- 1C
- 1D
* 1D_i
* 1D_ii
- 1E
Area2
- 2A_scaled
- 2B_scaled
- 2C_scaled
- 2D_scaled
- 2E_scaled
- 2F_scaled
- 2G_scaled
- 2H_scaled
Area3
- 3A
- 3B
- 3C
- 3D
Area4
- 4A
* 4A_i
* 4A_ii
* 4A_iii
- 4B
* 4B_i
* 4B_ii
- 4C
- 4D
* 4D_i
* 4D_ii
* 4D_iii
* 4D_iv
Area5
- 5A
* 5A_i
* 5A_ii
* 5A_iii
- 5B
* 5B_i
* 5B_ii
* 5B_iii
* 5B_iv
* 5B_v
* 5B_vi
* 5B_vii
- 5C
* 5C_i
* 5C_ii
* 5C_iii
* 5C_iv
- 5D
* 5D_i
* 5D_ii
* 5D_iii
6.2 Imputation functions
# 1) Fill missing COMPONENTS from AREA identity
# Under equal weights: Area = (1/k) * sum(components)
# Rearranged for a missing component:
# target_component = k * Area - sum(other components)
fill_components_from_area <- function(df, area_name, component_names) {
# work on a copy so original data is unchanged
out <- df
# keep only component columns that exist in the data
comps <- intersect(component_names, names(out))
# cannot recover if area is missing or not enough components
if (!(area_name %in% names(out)) || length(comps) < 2) return(out)
# k = number of components used in this area identity
k <- length(comps)
for (target in comps) {
# all components except the one being imputed
others <- setdiff(comps, target)
# candidate rows: target missing but area available
miss <- is.na(out[[target]]) & !is.na(out[[area_name]])
# require all other components to be present for exact algebraic recovery
ok <- miss & complete.cases(out[, others, drop = FALSE])
# apply top-down identity only where recovery is feasible
if (any(ok)) {
out[[target]][ok] <- k * out[[area_name]][ok] -
rowSums(out[ok, others, drop = FALSE])
}
}
out
}
# 1) Components from Areas
df_2022_imp <- df_2022 %>%
fill_components_from_area("Area1", c("1A","1B","1C","1D","1E")) %>%
fill_components_from_area("Area2", c("2A_scaled","2B_scaled","2C_scaled","2D_scaled","2E_scaled","2F_scaled","2G_scaled","2H_scaled")) %>%
fill_components_from_area("Area3", c("3A","3B","3C","3D")) %>%
fill_components_from_area("Area4", c("4A","4B","4C","4D")) %>%
fill_components_from_area("Area5", c("5A","5B","5C","5D"))
vis_dat(df_2022)vis_dat(df_2022_imp)# 2) Fill missing SUBCOMPONENTS from parent COMPONENT
# If a subcomponent is missing but parent component is observed,
# copy the parent value into the missing subcomponent.
# (Consistent with equal-weight aggregation when subcomponents are symmetric.)
fill_subcomponents_from_component <- function(df, component_name, subcomponent_names) {
# work on a copy so original data is unchanged
out <- df
# keep only subcomponent columns that exist
subs <- intersect(subcomponent_names, names(out))
# nothing to do if parent component or subcomponents are absent
if (!(component_name %in% names(out)) || length(subs) == 0) return(out)
for (s in subs) {
# only fill where subcomponent is missing and parent is available
miss <- is.na(out[[s]]) & !is.na(out[[component_name]])
out[[s]][miss] <- out[[component_name]][miss]
}
out
}
# 2) Subcomponents from Components
df_2022_imp <- df_2022_imp %>%
fill_subcomponents_from_component("1D", c("1D_i","1D_ii")) %>%
fill_subcomponents_from_component("4A", c("4A_i","4A_ii","4A_iii")) %>%
fill_subcomponents_from_component("4B", c("4B_i","4B_ii")) %>%
fill_subcomponents_from_component("4D", c("4D_i","4D_ii","4D_iii","4D_iv")) %>%
fill_subcomponents_from_component("5A", c("5A_i","5A_ii","5A_iii")) %>%
fill_subcomponents_from_component("5B", c("5B_i","5B_ii","5B_iii","5B_iv","5B_v","5B_vi","5B_vii")) %>%
fill_subcomponents_from_component("5C", c("5C_i","5C_ii","5C_iii","5C_iv")) %>%
fill_subcomponents_from_component("5D", c("5D_i","5D_ii","5D_iii"))
vis_dat(df_2022)vis_dat(df_2022_imp)# check remaining missing columns
names(df_2022_imp)[colSums(is.na(df_2022_imp)) > 0][1] "1B" "1E" "2D" "2E" "2G" "2D_scaled"
[7] "2E_scaled" "2G_scaled"
# Add one fallback step for rows where top-down cannot recover
# (because too many siblings are missing in same row).
fill_remaining_by_col_median <- function(df, vars) {
out <- df
vars <- intersect(vars, names(out))
for (v in vars) {
med <- median(out[[v]], na.rm = TRUE)
miss <- is.na(out[[v]])
out[[v]][miss] <- med
}
out
}
# apply fallback only to remaining NA columns
remaining <- names(df_2022_imp)[colSums(is.na(df_2022_imp)) > 0]
df_2022_imp <- fill_remaining_by_col_median(df_2022_imp, remaining)
# re-check
names(df_2022_imp)[colSums(is.na(df_2022_imp)) > 0]character(0)
vis_dat(df_2022_imp)# export
write_xlsx(x = df_2022_imp,
path = "../data/clean/economic_freedom_cleaned_2022_imputed.xlsx"
)7 Standardize Subcomponents
Z-scores to adjust for magnitudes.
# Step 1: Create a copy of the original dataset
df_2022_raw <- df_2022_imp
# Step 2: Identify numeric columns to standardize (exclude Year, Areas, and Ranks)
num_cols <- df_2022_raw %>%
select(where(is.numeric)) %>%
names() %>%
setdiff(c("Year",
"Rank",
paste0("Area", 1:5),
paste0("Area", 1:5, "_rank"))) # Exclude Area scores & ranks
# Step 3: Create standardized versions with "_z" suffix
df_2022_std <- df_2022_raw %>%
mutate(across(all_of(num_cols), ~ as.numeric(scale(.)), .names = "{.col}_z"))df_2022_std has Year, Countries, all other numeric columns, z-scored cols and labels preserved.
8 Lavaan Safe
# 1. Rename variables to make them lavaan-safe (prepend "X")
df_2022_std <- df_2022_std %>%
dplyr::rename_with(.fn = ~ paste0("X", .x), .cols = matches("^[0-9]"))9 Summary Stats Table (2022) with standardised vars
# 1. Split into standardized and non-standardized numeric variables
df_std_vars <- df_2022_std %>% select(ends_with("_z"))
df_non_std_vars <- df_2022_std %>%
select(where(is.numeric)) %>%
select(-ends_with("_z"))
# Use the shared label map defined in the setup functions section.
paper_var_labels_summary <- var_label_map
# 2. Descriptive stats for standardized variables
desc_std <- psych::describe(df_std_vars, fast = FALSE)
desc_std_df <- data.frame(desc_std) %>%
rownames_to_column("Variable") %>%
select(Variable, n, mean, sd, min, max) %>%
rename(N = n, Mean = mean, SD = sd, Min = min, Max = max)
# 3. Descriptive stats for non-standardized variables
desc_non_std <- psych::describe(df_non_std_vars, fast = FALSE)
desc_non_std_df <- data.frame(desc_non_std) %>%
rownames_to_column("Variable") %>%
select(Variable, n, mean, sd, min, max) %>%
rename(N = n, Mean = mean, SD = sd, Min = min, Max = max) %>%
mutate(
Variable_Label = dplyr::if_else(
Variable %in% names(paper_var_labels_summary),
unname(paper_var_labels_summary[Variable]),
Variable
),
`Code: Variable` = dplyr::if_else(
Variable %in% names(paper_var_labels_summary),
paste0(Variable, ": ", Variable_Label),
Variable
)
)
# 4. Display: Non-standardized variables
kable(
desc_non_std_df %>% select(`Code: Variable`, N, Mean, SD, Min, Max),
digits = 2,
col.names = c("Code: Variable", "N", "Mean", "SD", "Min", "Max"),
align = "lccccc",
caption = "Summary Statistics – Non-Standardized Variables"
) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 12)| Code: Variable | N | Mean | SD | Min | Max |
|---|---|---|---|---|---|
| Year | 165 | 2022.00 | 0.00 | 2022.00 | 2022.00 |
| Economic Freedom Summary Index | 165 | 6.53 | 1.05 | 3.02 | 8.58 |
| Rank | 165 | 82.78 | 47.80 | 1.00 | 165.00 |
| X1A: Government consumption | 165 | 5.47 | 2.46 | 0.00 | 10.00 |
| X1B: Transfers and subsidies | 165 | 7.45 | 2.05 | 1.82 | 10.00 |
| X1C: Government investment | 165 | 7.12 | 3.24 | 0.00 | 10.00 |
| X1D_i: Top marginal income tax rate | 165 | 7.47 | 2.19 | 2.00 | 10.00 |
| X1D_ii: Top marginal income and payroll tax rate | 165 | 5.52 | 2.63 | 0.00 | 10.00 |
| X1D | 165 | 6.50 | 2.23 | 1.00 | 10.00 |
| X1E: State ownership of assets | 165 | 6.68 | 1.58 | 2.44 | 9.52 |
| Area1 | 165 | 6.64 | 1.12 | 3.62 | 9.06 |
| Area1_rank | 165 | 83.00 | 47.78 | 1.00 | 165.00 |
| X2A | 165 | 5.51 | 1.45 | 2.45 | 8.64 |
| X2B | 165 | 4.66 | 1.89 | 0.83 | 8.84 |
| X2C | 165 | 5.44 | 2.16 | 0.00 | 9.67 |
| X2D | 165 | 6.22 | 2.44 | 0.00 | 10.00 |
| X2E | 165 | 5.83 | 1.84 | 1.70 | 9.80 |
| X2F | 165 | 3.97 | 1.98 | 0.00 | 8.73 |
| X2G | 165 | 7.59 | 1.51 | 2.67 | 9.98 |
| X2H | 165 | 5.21 | 2.42 | 0.00 | 9.77 |
| X2_adj | 165 | 0.87 | 0.17 | 0.29 | 1.00 |
| X2_wo_gen | 165 | 5.53 | 1.66 | 1.98 | 9.10 |
| X2_with_gen | 165 | 5.23 | 1.79 | 1.63 | 9.10 |
| Area2 | 165 | 5.23 | 1.79 | 1.63 | 9.10 |
| Area2_rank | 165 | 83.00 | 47.78 | 1.00 | 165.00 |
| X2A_scaled: Judicial independence* | 165 | 5.20 | 1.59 | 1.97 | 8.64 |
| X2B_scaled: Impartial courts* | 165 | 4.42 | 1.96 | 0.68 | 8.84 |
| X2C_scaled: Property rights* | 165 | 5.16 | 2.23 | 0.00 | 9.67 |
| X2D_scaled: Military interference* | 165 | 5.77 | 2.61 | 0.00 | 10.00 |
| X2E_scaled: Legal integrity* | 165 | 5.52 | 1.97 | 1.55 | 9.80 |
| X2F_scaled: Contract enforcement* | 165 | 3.78 | 2.02 | 0.00 | 8.73 |
| X2G_scaled: Real property restrictions* | 165 | 7.10 | 1.69 | 2.35 | 9.98 |
| X2H_scaled: Police reliability* | 165 | 4.92 | 2.38 | 0.00 | 9.77 |
| X3A: Money growth | 165 | 8.25 | 1.61 | 0.00 | 9.98 |
| X3B: Inflation volatility | 165 | 7.65 | 2.52 | 0.00 | 9.74 |
| X3C: Recent inflation | 165 | 5.88 | 2.61 | 0.00 | 9.46 |
| X3D: Foreign currency accounts | 165 | 7.15 | 3.95 | 0.00 | 10.00 |
| Area3 | 165 | 7.23 | 1.71 | 0.74 | 9.55 |
| Area3_rank | 165 | 82.99 | 47.77 | 1.00 | 165.00 |
| X4A_i: Trade tax revenue | 165 | 8.57 | 1.48 | 3.33 | 10.00 |
| X4A_ii: Mean tariff rate | 165 | 8.24 | 1.05 | 3.50 | 10.00 |
| X4A_iii: Tariff dispersion | 165 | 6.06 | 2.13 | 0.00 | 10.00 |
| X4A | 165 | 7.62 | 1.18 | 3.92 | 10.00 |
| X4B_i: Non-tariff barriers | 165 | 5.75 | 1.80 | 0.00 | 9.18 |
| X4B_ii: Trade compliance costs | 165 | 6.46 | 3.01 | 0.00 | 9.98 |
| X4B | 165 | 6.11 | 2.11 | 1.69 | 9.41 |
| X4C: Black market exchange rates | 165 | 9.22 | 2.47 | 0.00 | 10.00 |
| X4D_i: Financial openness | 165 | 5.84 | 3.02 | 0.00 | 10.00 |
| X4D_ii: Capital controls | 165 | 3.54 | 2.76 | 0.00 | 10.00 |
| X4D_iii: Freedom of foreigners to visit | 165 | 7.10 | 3.38 | 0.00 | 10.00 |
| X4D_iv: Protection of foreign assets | 165 | 5.73 | 2.10 | 0.00 | 9.27 |
| X4D | 165 | 5.55 | 2.15 | 0.69 | 9.40 |
| Area4 | 165 | 7.12 | 1.49 | 2.48 | 9.66 |
| Area4_rank | 165 | 83.00 | 47.78 | 1.00 | 165.00 |
| X5A_i: Ownership of banks | 165 | 7.53 | 2.78 | 0.00 | 10.00 |
| X5A_ii: Private sector credit | 165 | 7.35 | 2.56 | 0.00 | 10.00 |
| X5A_iii: Interest rate controls | 165 | 8.29 | 2.33 | 0.00 | 10.00 |
| X5A | 165 | 7.72 | 1.65 | 0.00 | 10.00 |
| X5B_i: Hiring regulations and minimum wage | 165 | 5.81 | 1.80 | 1.25 | 9.45 |
| X5B_ii: Hiring and firing regulations | 165 | 5.64 | 1.97 | 0.00 | 10.00 |
| X5B_iii: Centralized collective bargaining | 165 | 6.40 | 1.26 | 1.61 | 8.67 |
| X5B_iv: Hours regulations | 165 | 7.84 | 1.94 | 2.00 | 10.00 |
| X5B_v: Mandated cost of worker dismissal | 165 | 6.45 | 2.83 | 0.00 | 10.00 |
| X5B_vi: Conscription | 165 | 6.61 | 4.20 | 0.00 | 10.00 |
| X5B_vii: Foreign labor | 165 | 5.02 | 1.49 | 0.00 | 8.14 |
| X5B | 165 | 6.25 | 1.22 | 2.67 | 9.14 |
| X5C_i: Regulatory burden | 165 | 4.11 | 1.26 | 0.86 | 7.44 |
| X5C_ii: Bureaucracy costs | 165 | 5.04 | 2.22 | 0.00 | 9.56 |
| X5C_iii: Impartial public administration | 165 | 5.63 | 2.46 | 0.34 | 9.91 |
| X5C_iv: Tax compliance | 165 | 5.90 | 1.92 | 0.00 | 9.87 |
| X5C | 165 | 5.17 | 1.55 | 0.60 | 9.05 |
| X5D_i: Market openness | 165 | 6.07 | 1.80 | 1.24 | 10.00 |
| X5D_ii: Business permits | 165 | 8.28 | 1.34 | 2.63 | 10.00 |
| X5D_iii: Distortion of business environment | 165 | 5.20 | 2.29 | 0.00 | 10.00 |
| X5D | 165 | 6.52 | 1.44 | 2.59 | 9.86 |
| Area5 | 165 | 6.41 | 1.13 | 2.54 | 8.86 |
| Area5_rank | 165 | 83.00 | 47.78 | 1.00 | 165.00 |
# 5. Display: Standardized variables
kable(
desc_std_df,
digits = 2,
col.names = c("Variable", "N", "Mean", "SD", "Min", "Max"),
align = "lccccc",
caption = "Summary Statistics – Standardized Variables (_z)"
) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 12)| Variable | N | Mean | SD | Min | Max |
|---|---|---|---|---|---|
| Economic Freedom Summary Index_z | 165 | 0 | 1 | -3.35 | 1.96 |
| X1A_z | 165 | 0 | 1 | -2.23 | 1.84 |
| X1B_z | 165 | 0 | 1 | -2.74 | 1.25 |
| X1C_z | 165 | 0 | 1 | -2.20 | 0.89 |
| X1D_i_z | 165 | 0 | 1 | -2.49 | 1.15 |
| X1D_ii_z | 165 | 0 | 1 | -2.10 | 1.70 |
| X1D_z | 165 | 0 | 1 | -2.47 | 1.57 |
| X1E_z | 165 | 0 | 1 | -2.69 | 1.80 |
| X2A_z | 165 | 0 | 1 | -2.12 | 2.16 |
| X2B_z | 165 | 0 | 1 | -2.03 | 2.22 |
| X2C_z | 165 | 0 | 1 | -2.52 | 1.96 |
| X2D_z | 165 | 0 | 1 | -2.55 | 1.55 |
| X2E_z | 165 | 0 | 1 | -2.24 | 2.15 |
| X2F_z | 165 | 0 | 1 | -2.00 | 2.40 |
| X2G_z | 165 | 0 | 1 | -3.26 | 1.58 |
| X2H_z | 165 | 0 | 1 | -2.15 | 1.89 |
| X2_adj_z | 165 | 0 | 1 | -3.45 | 0.77 |
| X2_wo_gen_z | 165 | 0 | 1 | -2.13 | 2.15 |
| X2_with_gen_z | 165 | 0 | 1 | -2.01 | 2.16 |
| X2A_scaled_z | 165 | 0 | 1 | -2.04 | 2.16 |
| X2B_scaled_z | 165 | 0 | 1 | -1.91 | 2.26 |
| X2C_scaled_z | 165 | 0 | 1 | -2.31 | 2.02 |
| X2D_scaled_z | 165 | 0 | 1 | -2.21 | 1.62 |
| X2E_scaled_z | 165 | 0 | 1 | -2.02 | 2.18 |
| X2F_scaled_z | 165 | 0 | 1 | -1.87 | 2.46 |
| X2G_scaled_z | 165 | 0 | 1 | -2.82 | 1.71 |
| X2H_scaled_z | 165 | 0 | 1 | -2.07 | 2.04 |
| X3A_z | 165 | 0 | 1 | -5.14 | 1.08 |
| X3B_z | 165 | 0 | 1 | -3.04 | 0.83 |
| X3C_z | 165 | 0 | 1 | -2.25 | 1.37 |
| X3D_z | 165 | 0 | 1 | -1.81 | 0.72 |
| X4A_i_z | 165 | 0 | 1 | -3.54 | 0.97 |
| X4A_ii_z | 165 | 0 | 1 | -4.52 | 1.68 |
| X4A_iii_z | 165 | 0 | 1 | -2.85 | 1.85 |
| X4A_z | 165 | 0 | 1 | -3.15 | 2.02 |
| X4B_i_z | 165 | 0 | 1 | -3.19 | 1.90 |
| X4B_ii_z | 165 | 0 | 1 | -2.14 | 1.17 |
| X4B_z | 165 | 0 | 1 | -2.09 | 1.57 |
| X4C_z | 165 | 0 | 1 | -3.73 | 0.32 |
| X4D_i_z | 165 | 0 | 1 | -1.94 | 1.38 |
| X4D_ii_z | 165 | 0 | 1 | -1.28 | 2.34 |
| X4D_iii_z | 165 | 0 | 1 | -2.10 | 0.86 |
| X4D_iv_z | 165 | 0 | 1 | -2.73 | 1.69 |
| X4D_z | 165 | 0 | 1 | -2.27 | 1.79 |
| X5A_i_z | 165 | 0 | 1 | -2.70 | 0.89 |
| X5A_ii_z | 165 | 0 | 1 | -2.87 | 1.03 |
| X5A_iii_z | 165 | 0 | 1 | -3.56 | 0.74 |
| X5A_z | 165 | 0 | 1 | -4.68 | 1.38 |
| X5B_i_z | 165 | 0 | 1 | -2.54 | 2.02 |
| X5B_ii_z | 165 | 0 | 1 | -2.87 | 2.21 |
| X5B_iii_z | 165 | 0 | 1 | -3.81 | 1.81 |
| X5B_iv_z | 165 | 0 | 1 | -3.01 | 1.11 |
| X5B_v_z | 165 | 0 | 1 | -2.28 | 1.25 |
| X5B_vi_z | 165 | 0 | 1 | -1.57 | 0.81 |
| X5B_vii_z | 165 | 0 | 1 | -3.36 | 2.09 |
| X5B_z | 165 | 0 | 1 | -2.93 | 2.36 |
| X5C_i_z | 165 | 0 | 1 | -2.57 | 2.64 |
| X5C_ii_z | 165 | 0 | 1 | -2.27 | 2.04 |
| X5C_iii_z | 165 | 0 | 1 | -2.15 | 1.74 |
| X5C_iv_z | 165 | 0 | 1 | -3.07 | 2.07 |
| X5C_z | 165 | 0 | 1 | -2.95 | 2.51 |
| X5D_i_z | 165 | 0 | 1 | -2.69 | 2.19 |
| X5D_ii_z | 165 | 0 | 1 | -4.20 | 1.28 |
| X5D_iii_z | 165 | 0 | 1 | -2.27 | 2.10 |
| X5D_z | 165 | 0 | 1 | -2.73 | 2.33 |
# export
write_xlsx(x = df_2022_std,
path = "../data/clean/economic_freedom_cleaned_2022_standardised.xlsx")
# Paper export: use the non-standardized summary stats (not desc_std_df)
dir.create("../paper_draft/tables", recursive = TRUE, showWarnings = FALSE)
write.csv(
desc_non_std_df %>% select(`Code: Variable`, N, Mean, SD, Min, Max),
"../paper_draft/tables/paper_summary_stats_nonstandardized.csv",
row.names = FALSE
)
# Reader-friendly appendix table: keep only Component/Subcomponent variables that have explicit labels in `var_label_map`
write.csv(
desc_non_std_df %>%
filter(Variable %in% names(var_label_map)) %>%
select(`Code: Variable`, N, Mean, SD, Min, Max),
"../paper_draft/tables/paper_summary_stats_nonstandardized_labeled_v7.csv",
row.names = FALSE
)
#str(df_2022_std)10 EXCEL REPLICATION
- Write
rmeanfunction to compute row means across multiple columns
rmean <- function(...){
rowMeans(cbind(...), na.rm = TRUE)
}
#rmean(df_2022_std$X1A, df_2022_std$X1B)11 METHOD 3: Direct Weighted Average on Sub-Components
# Create manual EFI and rank
df_2022_manual_subcomponent <- df_2022_std %>%
mutate(
EFI_Manual_subcomponent =
0.20 * rmean(x = X1A,
X1B,
X1C,
rmean(X1D_i, X1D_ii),
X1E
) +
0.20 * rmean(x = X2A_scaled,
X2B_scaled,
X2C_scaled,
X2D_scaled,
X2E_scaled,
X2F_scaled,
X2G_scaled,
X2H_scaled
) +
0.20 * rmean(x = X3A,
X3B,
X3C,
X3D
) +
0.20 * rmean(x = rmean(X4A_i, X4A_ii, X4A_iii),
rmean(X4B_i, X4B_ii), X4C,
rmean(X4D_i, X4D_ii, X4D_iii, X4D_iv)
) +
0.20 * rmean(x = rmean(X5A_i, X5A_ii, X5A_iii),
rmean(X5B_i, X5B_ii, X5B_iii, X5B_iv, X5B_v, X5B_vi, X5B_vii),
rmean(X5C_i, X5C_ii, X5C_iii, X5C_iv),
rmean(X5D_i, X5D_ii, X5D_iii)
)
)
# Option 1 (original): strict sequential rank (no tie handling)
# df_2022_manual_subcomponent <- df_2022_manual_subcomponent %>%
# arrange(desc(EFI_Manual_subcomponent)) %>%
# mutate(Rank_Manual_subcomponent = row_number())
# Option 2 (recommended): tie-aware rank on displayed 2dp score (6,6,8 style)
df_2022_manual_subcomponent <- df_2022_manual_subcomponent %>%
mutate(
EFI_Manual_subcomponent_2dp = round(EFI_Manual_subcomponent, 2),
Rank_Manual_subcomponent = min_rank(desc(EFI_Manual_subcomponent_2dp))
) %>%
arrange(Rank_Manual_subcomponent, desc(EFI_Manual_subcomponent_2dp), Countries)
# Select relevant columns for comparison
rank_comparison_subcomponent <- df_2022_manual_subcomponent %>%
select(Countries, Rank, Rank_Manual_subcomponent, `Economic Freedom Summary Index`, EFI_Manual_subcomponent)
# Display nicely
kable(
rank_comparison_subcomponent,
digits = 2,
col.names = c("Country", "Original Rank", "Manual Rank", "Economic Freedom Summary Index","Manual EFI Score"),
caption = "Comparison of Original vs. Manual EFI Rankings"
) %>%
kable_styling(full_width = FALSE, position = "center", font_size = 12)| Country | Original Rank | Manual Rank | Economic Freedom Summary Index | Manual EFI Score |
|---|---|---|---|---|
| Hong Kong SAR, China | 1 | 1 | 8.58 | 8.58 |
| Singapore | 2 | 2 | 8.55 | 8.55 |
| Switzerland | 3 | 3 | 8.43 | 8.43 |
| New Zealand | 4 | 4 | 8.39 | 8.39 |
| United States | 5 | 5 | 8.09 | 8.09 |
| Denmark | 6 | 6 | 8.02 | 8.02 |
| Ireland | 6 | 6 | 8.02 | 8.02 |
| Canada | 8 | 8 | 7.99 | 7.99 |
| Australia | 9 | 9 | 7.98 | 7.98 |
| Luxembourg | 9 | 9 | 7.98 | 7.98 |
| Japan | 11 | 11 | 7.90 | 7.90 |
| United Kingdom | 12 | 12 | 7.88 | 7.88 |
| Finland | 13 | 13 | 7.87 | 7.87 |
| Iceland | 14 | 14 | 7.84 | 7.84 |
| Malta | 15 | 15 | 7.82 | 7.82 |
| Germany | 16 | 16 | 7.80 | 7.80 |
| Mauritius | 17 | 17 | 7.79 | 7.79 |
| Netherlands | 18 | 18 | 7.74 | 7.74 |
| Taiwan | 19 | 19 | 7.71 | 7.71 |
| Estonia | 20 | 20 | 7.67 | 7.67 |
| Georgia | 21 | 21 | 7.66 | 7.66 |
| Czechia | 22 | 22 | 7.65 | 7.65 |
| Austria | 23 | 23 | 7.62 | 7.62 |
| Panama | 23 | 23 | 7.62 | 7.62 |
| Sweden | 25 | 25 | 7.61 | 7.61 |
| Costa Rica | 26 | 26 | 7.60 | 7.60 |
| Portugal | 27 | 27 | 7.59 | 7.59 |
| Norway | 28 | 28 | 7.58 | 7.58 |
| Malaysia | 29 | 29 | 7.56 | 7.56 |
| Spain | 30 | 30 | 7.54 | 7.54 |
| Cyprus | 31 | 31 | 7.53 | 7.53 |
| Korea, Rep. | 32 | 32 | 7.52 | 7.52 |
| Latvia | 32 | 32 | 7.52 | 7.52 |
| Bahrain | 34 | 34 | 7.51 | 7.51 |
| Lithuania | 34 | 34 | 7.51 | 7.51 |
| Armenia | 36 | 36 | 7.49 | 7.49 |
| France | 36 | 36 | 7.49 | 7.49 |
| Albania | 38 | 38 | 7.48 | 7.48 |
| Chile | 39 | 39 | 7.45 | 7.45 |
| Guatemala | 39 | 39 | 7.45 | 7.45 |
| Israel | 41 | 41 | 7.44 | 7.44 |
| Belgium | 42 | 42 | 7.42 | 7.42 |
| Peru | 43 | 43 | 7.41 | 7.41 |
| Romania | 43 | 43 | 7.41 | 7.41 |
| Slovak Republic | 45 | 45 | 7.39 | 7.39 |
| United Arab Emirates | 45 | 45 | 7.39 | 7.39 |
| Cabo Verde | 47 | 47 | 7.37 | 7.37 |
| Jordan | 48 | 48 | 7.33 | 7.33 |
| Dominican Republic | 49 | 49 | 7.32 | 7.32 |
| Jamaica | 49 | 49 | 7.32 | 7.32 |
| Italy | 51 | 51 | 7.22 | 7.22 |
| Bulgaria | 52 | 52 | 7.16 | 7.16 |
| Montenegro | 53 | 53 | 7.15 | 7.15 |
| Slovenia | 54 | 54 | 7.14 | 7.14 |
| Hungary | 55 | 55 | 7.12 | 7.12 |
| Croatia | 56 | 56 | 7.07 | 7.07 |
| Brunei Darussalam | 61 | 57 | 6.99 | 7.06 |
| Uruguay | 57 | 58 | 7.04 | 7.04 |
| Saudi Arabia | 58 | 59 | 7.03 | 7.03 |
| El Salvador | 59 | 60 | 7.01 | 7.01 |
| Philippines | 59 | 60 | 7.01 | 7.01 |
| Paraguay | 62 | 62 | 6.98 | 6.98 |
| Seychelles | 62 | 62 | 6.98 | 6.98 |
| Indonesia | 64 | 64 | 6.96 | 6.96 |
| Mexico | 65 | 65 | 6.94 | 6.94 |
| Thailand | 65 | 65 | 6.94 | 6.94 |
| Honduras | 67 | 67 | 6.92 | 6.92 |
| Kazakhstan | 68 | 68 | 6.89 | 6.89 |
| Mongolia | 69 | 69 | 6.86 | 6.86 |
| Cambodia | 70 | 70 | 6.85 | 6.85 |
| Gambia, The | 70 | 70 | 6.85 | 6.85 |
| Greece | 70 | 70 | 6.85 | 6.85 |
| Poland | 70 | 70 | 6.85 | 6.85 |
| Kenya | 74 | 74 | 6.82 | 6.82 |
| Barbados | 75 | 75 | 6.81 | 6.81 |
| Uganda | 76 | 76 | 6.75 | 6.75 |
| North Macedonia | 77 | 77 | 6.74 | 6.74 |
| Qatar | 78 | 78 | 6.71 | 6.71 |
| Serbia | 79 | 79 | 6.70 | 6.70 |
| Botswana | 80 | 80 | 6.67 | 6.67 |
| Bahamas, The | 81 | 81 | 6.65 | 6.65 |
| South Africa | 81 | 81 | 6.65 | 6.65 |
| Trinidad and Tobago | 83 | 83 | 6.64 | 6.64 |
| India | 84 | 84 | 6.58 | 6.58 |
| Brazil | 85 | 85 | 6.56 | 6.56 |
| Colombia | 86 | 86 | 6.54 | 6.54 |
| Nepal | 86 | 86 | 6.54 | 6.54 |
| Kyrgyz Republic | 88 | 88 | 6.53 | 6.53 |
| Moldova | 89 | 89 | 6.49 | 6.49 |
| Morocco | 90 | 90 | 6.46 | 6.46 |
| Tajikistan | 90 | 90 | 6.46 | 6.46 |
| Kuwait | 92 | 92 | 6.45 | 6.45 |
| Rwanda | 92 | 92 | 6.45 | 6.45 |
| Nicaragua | 94 | 94 | 6.42 | 6.42 |
| Oman | 95 | 95 | 6.40 | 6.40 |
| Tanzania | 95 | 95 | 6.40 | 6.40 |
| Benin | 97 | 97 | 6.26 | 6.26 |
| Bhutan | 98 | 98 | 6.24 | 6.24 |
| Djibouti | 99 | 99 | 6.23 | 6.23 |
| Namibia | 99 | 99 | 6.23 | 6.23 |
| Vietnam | 99 | 99 | 6.23 | 6.23 |
| Bosnia and Herzegovina | 102 | 102 | 6.22 | 6.22 |
| Zambia | 103 | 103 | 6.17 | 6.17 |
| China | 104 | 104 | 6.14 | 6.14 |
| Ecuador | 104 | 104 | 6.14 | 6.14 |
| Bolivia | 106 | 106 | 6.11 | 6.11 |
| Belize | 109 | 107 | 6.00 | 6.08 |
| Togo | 107 | 108 | 6.07 | 6.07 |
| Papua New Guinea | 108 | 109 | 6.02 | 6.02 |
| Fiji | 110 | 110 | 5.99 | 5.99 |
| Senegal | 110 | 110 | 5.99 | 5.99 |
| Belarus | 112 | 112 | 5.98 | 5.98 |
| Ghana | 113 | 113 | 5.96 | 5.96 |
| Lesotho | 113 | 113 | 5.96 | 5.96 |
| Nigeria | 113 | 113 | 5.96 | 5.96 |
| Madagascar | 116 | 116 | 5.95 | 5.95 |
| Somalia | 116 | 116 | 5.95 | 5.95 |
| Tunisia | 118 | 118 | 5.94 | 5.94 |
| Russian Federation | 119 | 119 | 5.93 | 5.93 |
| Mauritania | 120 | 120 | 5.91 | 5.91 |
| Burkina Faso | 121 | 121 | 5.90 | 5.90 |
| Timor-Leste | 130 | 121 | 5.77 | 5.90 |
| Côte d'Ivoire | 122 | 123 | 5.89 | 5.89 |
| Liberia | 123 | 124 | 5.87 | 5.87 |
| Sri Lanka | 123 | 124 | 5.87 | 5.87 |
| Lao PDR | 125 | 126 | 5.86 | 5.86 |
| Mozambique | 125 | 126 | 5.86 | 5.86 |
| Bangladesh | 127 | 128 | 5.85 | 5.85 |
| Azerbaijan | 128 | 129 | 5.80 | 5.80 |
| Haiti | 128 | 129 | 5.80 | 5.80 |
| Niger | 131 | 131 | 5.70 | 5.70 |
| Guinea | 132 | 132 | 5.63 | 5.63 |
| Mali | 132 | 132 | 5.63 | 5.63 |
| Pakistan | 134 | 134 | 5.62 | 5.62 |
| Cameroon | 135 | 135 | 5.61 | 5.61 |
| Guyana | 136 | 136 | 5.60 | 5.60 |
| Sierra Leone | 137 | 137 | 5.59 | 5.59 |
| Türkiye | 138 | 138 | 5.57 | 5.57 |
| Comoros | 139 | 139 | 5.56 | 5.56 |
| Malawi | 140 | 140 | 5.48 | 5.48 |
| Suriname | 141 | 141 | 5.46 | 5.46 |
| Eswatini | 142 | 142 | 5.42 | 5.42 |
| Egypt, Arab Rep. | 143 | 143 | 5.36 | 5.36 |
| Lebanon | 144 | 144 | 5.35 | 5.35 |
| Guinea-Bissau | 145 | 145 | 5.32 | 5.32 |
| Gabon | 146 | 146 | 5.28 | 5.28 |
| Iraq | 147 | 147 | 5.25 | 5.25 |
| Ethiopia | 148 | 148 | 5.24 | 5.24 |
| Burundi | 149 | 149 | 5.22 | 5.22 |
| Ukraine | 150 | 150 | 5.12 | 5.12 |
| Chad | 151 | 151 | 5.05 | 5.05 |
| Congo, Dem. Rep. | 152 | 152 | 5.02 | 5.02 |
| Central African Republic | 153 | 153 | 4.92 | 4.92 |
| Congo, Rep. | 154 | 154 | 4.89 | 4.89 |
| Angola | 155 | 155 | 4.79 | 4.79 |
| Yemen, Rep. | 156 | 156 | 4.68 | 4.68 |
| Libya | 157 | 157 | 4.65 | 4.65 |
| Iran, Islamic Rep. | 158 | 158 | 4.63 | 4.63 |
| Argentina | 159 | 159 | 4.55 | 4.55 |
| Myanmar | 160 | 160 | 4.54 | 4.54 |
| Algeria | 161 | 161 | 4.46 | 4.46 |
| Syrian Arab Republic | 162 | 162 | 4.28 | 4.28 |
| Sudan | 163 | 163 | 4.11 | 4.11 |
| Zimbabwe | 164 | 164 | 3.53 | 3.53 |
| Venezuela, RB | 165 | 165 | 3.02 | 3.02 |
The Economic Freedom Index claims to weight its five major Areas equally at 20% each, but this approach masks a more fundamental problem: the implicit weighting of individual Components and Subcomponents is highly unequal and seemingly arbitrary. Because each Area contains different numbers of Components and Subcomponents, individual measures end up with vastly different influence on the overall score. For instance, the government consumption Component (X1A) carries approximately 4% weight in the final index, while each labor-regulation Subcomponent (X5B_i through X5B_vii) carries only about 0.71% weight (a 5.6x difference). This means a single government-spending measure has much more influence than any one labor-market Subcomponent, despite no clear theoretical justification for why one should matter more than the other. Equal weighting at the Area level can obscure large disparities in the implied weights applied to Components and Subcomponents.
12 MODELS FOR WEIGHTS
12.1 EFA Theory
EFA is used when one don’t know the underlying structure of your data. It’s a data-driven approach where you:
Let the data reveal patterns and groupings
Discover which variables cluster together
Determine how many factors (latent variables) exist
See which items load onto which factors
12.2 CFA Theory
CFA is used when one already have a theory about the structure. It’s a theory-testing approach where you:
Specify in advance how many factors exist
Define which items should load on which factors
Test whether your proposed model fits the actual data
Evaluate if your theoretical structure is supporte
13 Implementation
Workflow used in this notebook:
- Run EFA to evaluate the latent structure in the Component/Subcomponent data.
- Derive empirical factor weights and compare country rankings.
- Run CFA to compare a data-driven 4-factor structure against the EFI-style 5-factor structure.
- Interpret results with emphasis on measurement fit and ranking sensitivity.
14 Local EFA/CFA Helpers
These small helper functions are defined inside this notebook so the post-Implementation analysis is self-contained and does not depend on external module files.
15 Area-Wise Correlations Before Factor Analysis
This section is a descriptive check before running EFA/CFA. It shows how indicators co-move within each EFI area using the cleaned 2022 dataset already created above.
Why this is useful:
- It gives an intuitive view of whether indicators inside each area appear to measure a common underlying dimension.
- It can reveal weakly related or unusually behaving indicators before factor analysis.
- It helps motivate why an empirical factor model may differ from the imposed five-area structure.
# Use the same Component/Subcomponent variables that feed the later factor analysis.
# For Area 2, use the gender-adjusted scaled indicators to match the EFI-adjusted measurement used elsewhere.
area1_corr <- df_2022_manual_subcomponent %>%
select(X1A, X1B, X1C, X1D_i, X1D_ii, X1E)
area2_corr <- df_2022_manual_subcomponent %>%
select(X2A_scaled, X2B_scaled, X2C_scaled, X2D_scaled, X2E_scaled, X2F_scaled, X2G_scaled, X2H_scaled)
area3_corr <- df_2022_manual_subcomponent %>%
select(X3A, X3B, X3C, X3D)
area4_corr <- df_2022_manual_subcomponent %>%
select(X4A_i, X4A_ii, X4A_iii, X4B_i, X4B_ii, X4C, X4D_i, X4D_ii, X4D_iii, X4D_iv)
area5_corr <- df_2022_manual_subcomponent %>%
select(
X5A_i, X5A_ii, X5A_iii,
X5B_i, X5B_ii, X5B_iii, X5B_iv, X5B_v, X5B_vi, X5B_vii,
X5C_i, X5C_ii, X5C_iii, X5C_iv,
X5D_i, X5D_ii, X5D_iii
)
# Single source of truth for Component/Subcomponent labels used in tables/figures/exports.
# Use `code: label` names in figures/tables so the paper is easier to read (analysis logic is unchanged).
efa_var_labels <- var_label_map
code_label_names <- function(df) {
current <- names(df)
mapped <- ifelse(current %in% names(efa_var_labels), paste0(current, ": ", efa_var_labels[current]), current)
names(df) <- mapped
df
}
area1_corr <- code_label_names(area1_corr)
area2_corr <- code_label_names(area2_corr)
area3_corr <- code_label_names(area3_corr)
area4_corr <- code_label_names(area4_corr)
area5_corr <- code_label_names(area5_corr)
pairwise_corr_range <- function(df, area_label) {
cm <- cor(df, use = "pairwise.complete.obs")
vals <- cm[lower.tri(cm, diag = FALSE)]
tibble::tibble(
Area = area_label,
MinCorr = min(vals, na.rm = TRUE),
MaxCorr = max(vals, na.rm = TRUE)
)
}
lapply(
list(Area1 = area1_corr, Area2 = area2_corr, Area3 = area3_corr, Area4 = area4_corr, Area5 = area5_corr),
dim
)$Area1
[1] 165 6
$Area2
[1] 165 8
$Area3
[1] 165 4
$Area4
[1] 165 10
$Area5
[1] 165 17
15.1 Area 1: Size of Government (Within-Area Correlations)
corplot(area1_corr, sz = 2.2, title = "Area 1: Size of Government")Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the ggcorrplot package.
Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
15.2 Area 2: Legal System and Property Rights (Within-Area Correlations)
corplot(area2_corr, sz = 2.0, title = "Area 2: Legal System and Property Rights (scaled)")15.3 Area 3: Sound Money (Within-Area Correlations)
corplot(area3_corr, sz = 2.5, title = "Area 3: Sound Money")15.4 Area 4: Freedom to Trade Internationally (Within-Area Correlations)
corplot(area4_corr, sz = 1.6, title = "Area 4: Freedom to Trade Internationally")15.5 Area 5: Regulation (Within-Area Correlations)
# Area 5 has many indicators, so labels and coefficients are smaller to keep the plot readable.
corplot(area5_corr, sz = 1.0, title = "Area 5: Regulation")15.6 Correlations Across the Five Official EFI Area Scores
area_scores_corr <- df_2022_manual_subcomponent %>%
select(Area1, Area2, Area3, Area4, Area5) %>%
rename(
`Area 1` = Area1,
`Area 2` = Area2,
`Area 3` = Area3,
`Area 4` = Area4,
`Area 5` = Area5
)
corplot(area_scores_corr, sz = 3.0, title = "Correlations Across the Five Official EFI Area Scores")Interpretation notes:
- Strong positive clustering within an area supports (but does not prove) that the area measures a coherent dimension.
- Weak or mixed correlations within an area can signal that equal aggregation may be combining distinct constructs.
- These plots are descriptive; EFA/CFA below provide the formal measurement tests.
# Export paper assets: correlation heatmaps (same style) + correlation ranges for chart notes.
dir.create("../paper_draft/images", recursive = TRUE, showWarnings = FALSE)
ggsave("../paper_draft/images/area1_corr_heatmap.png",
plot = corplot(area1_corr, sz = 2.2, title = "Area 1: Size of Government"),
width = 9.5, height = 7.5, dpi = 300)
ggsave("../paper_draft/images/area2_corr_heatmap.png",
plot = corplot(area2_corr, sz = 2.0, title = "Area 2: Legal System and Property Rights (scaled)"),
width = 10.5, height = 8.5, dpi = 300)
ggsave("../paper_draft/images/area3_corr_heatmap.png",
plot = corplot(area3_corr, sz = 2.5, title = "Area 3: Sound Money"),
width = 7, height = 5.5, dpi = 300)
ggsave("../paper_draft/images/area4_corr_heatmap.png",
plot = corplot(area4_corr, sz = 1.6, title = "Area 4: Freedom to Trade Internationally"),
width = 10, height = 8, dpi = 300)
ggsave("../paper_draft/images/area5_corr_heatmap.png",
plot = corplot(area5_corr, sz = 1.0, title = "Area 5: Regulation"),
width = 12, height = 10, dpi = 300)
ggsave("../paper_draft/images/area_scores_corr_heatmap.png",
plot = corplot(area_scores_corr, sz = 3.0, title = "Correlations Across the Five Official EFI Area Scores"),
width = 8.5, height = 7, dpi = 300)
dir.create("../paper_draft/tables", recursive = TRUE, showWarnings = FALSE)
write.csv(cor(area_scores_corr, use = "pairwise.complete.obs"),
"../paper_draft/tables/paper_area_scores_corr_matrix_v7.csv",
row.names = TRUE)
paper_corr_ranges <- dplyr::bind_rows(
pairwise_corr_range(area1_corr, "Area 1"),
pairwise_corr_range(area2_corr, "Area 2"),
pairwise_corr_range(area3_corr, "Area 3"),
pairwise_corr_range(area4_corr, "Area 4"),
pairwise_corr_range(area5_corr, "Area 5")
) %>%
mutate(across(c(MinCorr, MaxCorr), ~ round(.x, 3)))
write.csv(paper_corr_ranges, "../paper_draft/tables/paper_corr_ranges_v7.csv", row.names = FALSE)
paper_corr_ranges# A tibble: 5 × 3
Area MinCorr MaxCorr
<chr> <dbl> <dbl>
1 Area 1 -0.245 0.704
2 Area 2 0.394 0.924
3 Area 3 0.038 0.605
4 Area 4 -0.041 0.763
5 Area 5 -0.082 0.84
16 Run EFA
16.1 Build Indicator Matrix and Run Diagnostics
# Indicator-level EFA input (same specification as the original analysis logic)
efa_data <- df_2022_manual_subcomponent %>%
select(
# Area 1 (6)
X1A, X1B, X1C, X1D_i, X1D_ii, X1E,
# Area 2 (8, scaled)
X2A_scaled, X2B_scaled, X2C_scaled, X2D_scaled,
X2E_scaled, X2F_scaled, X2G_scaled, X2H_scaled,
# Area 3 (4)
X3A, X3B, X3C, X3D,
# Area 4 (10)
X4A_i, X4A_ii, X4A_iii,
X4B_i, X4B_ii,
X4C,
X4D_i, X4D_ii, X4D_iii, X4D_iv,
# Area 5 (17)
X5A_i, X5A_ii, X5A_iii,
X5B_i, X5B_ii, X5B_iii, X5B_iv, X5B_v, X5B_vi, X5B_vii,
X5C_i, X5C_ii, X5C_iii, X5C_iv,
X5D_i, X5D_ii, X5D_iii
)
efa_data_complete <- na.omit(efa_data)
cor_matrix <- cor(efa_data_complete, use = "pairwise.complete.obs")
kmo_result <- KMO(cor_matrix)
bartlett_result <- cortest.bartlett(cor_matrix, n = nrow(efa_data_complete))
# Drop the same four low-MSA variables used in your earlier analysis
low_msa_drop <- c("X5B_iv", "X5B_vi", "X5A_iii", "X4A_iii")
efa_data_clean <- efa_data_complete %>% select(-all_of(low_msa_drop))
# Return key dimensions/missingness as actual object output
rbind(
`Raw EFA matrix` = dim(efa_data),
`Complete-case EFA matrix` = dim(efa_data_complete),
`After low-MSA drops` = dim(efa_data_clean)
) [,1] [,2]
Raw EFA matrix 165 45
Complete-case EFA matrix 165 45
After low-MSA drops 165 41
sum(is.na(efa_data))[1] 0
sort(colSums(is.na(efa_data)), decreasing = TRUE) %>% head(10) X1A X1B X1C X1D_i X1D_ii X1E X2A_scaled
0 0 0 0 0 0 0
X2B_scaled X2C_scaled X2D_scaled
0 0 0
Interpretation:
- The EFA sample size is the complete-case
Nafter listwise deletion. - The
After low-MSA dropsmatrix should be108 x 41if the same filtering is reproduced. - Missingness matters because it can change which countries enter the factor model.
kmo_resultKaiser-Meyer-Olkin factor adequacy
Call: KMO(r = cor_matrix)
Overall MSA = 0.88
MSA for each item =
X1A X1B X1C X1D_i X1D_ii X1E X2A_scaled
0.83 0.89 0.85 0.73 0.66 0.84 0.91
X2B_scaled X2C_scaled X2D_scaled X2E_scaled X2F_scaled X2G_scaled X2H_scaled
0.91 0.94 0.94 0.93 0.95 0.86 0.94
X3A X3B X3C X3D X4A_i X4A_ii X4A_iii
0.73 0.78 0.70 0.88 0.87 0.89 0.48
X4B_i X4B_ii X4C X4D_i X4D_ii X4D_iii X4D_iv
0.94 0.94 0.82 0.93 0.81 0.83 0.94
X5A_i X5A_ii X5A_iii X5B_i X5B_ii X5B_iii X5B_iv
0.81 0.84 0.67 0.77 0.78 0.83 0.54
X5B_v X5B_vi X5B_vii X5C_i X5C_ii X5C_iii X5C_iv
0.67 0.48 0.92 0.87 0.94 0.94 0.94
X5D_i X5D_ii X5D_iii
0.93 0.74 0.91
bartlett_result$chisq
[1] 5840.759
$p.value
[1] 0
$df
[1] 990
Interpretation:
KMOassesses whether the correlation structure is suitable for factor analysis (higher is better).- Bartlett’s test checks whether the correlation matrix is significantly different from identity (small p-value supports factor analysis).
16.2 Parallel Analysis and VSS (Factor Count Guidance)
parallel_results <- fa.parallel(
efa_data_clean,
fa = "fa",
fm = "ml",
n.iter = 100
)Parallel analysis suggests that the number of factors = 6 and the number of components = NA
scree(efa_data_clean, factors = TRUE)vss_results <- VSS(
efa_data_clean,
n = 10,
rotate = "varimax",
fm = "ml"
)# Print object classes / summaries so the notebook records what was estimated
class(parallel_results)[1] "psych" "parallel"
class(vss_results)[1] "psych" "vss"
Interpretation:
- Parallel analysis and scree plots are heuristics for the number of factors.
- They support model selection but should be considered alongside interpretability and theory.
16.3 Fit the 4-Factor EFA Model
efa_4factor <- fa(
efa_data_clean,
nfactors = 4,
rotate = "varimax",
fm = "ml",
scores = "regression"
)
print(efa_4factor, cut = 0.30, sort = TRUE)Factor Analysis using method = ml
Call: fa(r = efa_data_clean, nfactors = 4, rotate = "varimax", scores = "regression",
fm = "ml")
Standardized loadings (pattern matrix) based upon correlation matrix
item ML2 ML3 ML4 ML1 h2 u2 com
X2E_scaled 11 0.79 0.46 0.903 0.097 1.8
X2B_scaled 8 0.75 0.37 0.46 0.916 0.084 2.2
X2A_scaled 7 0.74 0.52 0.886 0.114 2.1
X2D_scaled 10 0.73 0.36 0.746 0.254 1.8
X5C_iii 37 0.72 0.48 0.762 0.238 1.8
X2H_scaled 14 0.67 0.53 0.765 0.235 2.1
X1A 1 -0.63 0.472 0.528 1.4
X2C_scaled 9 0.61 0.51 0.52 0.903 0.097 2.9
X2F_scaled 12 0.60 0.50 0.33 0.750 0.250 2.8
X5C_ii 36 0.51 0.47 0.43 0.671 0.329 3.0
X1B 2 -0.46 -0.30 0.338 0.662 2.2
X4B_ii 22 0.45 0.40 0.39 0.518 0.482 3.0
X1D_i 4 -0.36 -0.33 0.307 0.693 2.9
X4B_i 21 0.32 0.62 0.38 0.638 0.362 2.2
X5C_iv 38 0.39 0.60 0.555 0.445 2.1
X4D_i 24 0.42 0.59 0.44 0.726 0.274 2.7
X4A_i 19 0.54 0.348 0.652 1.4
X3D 18 0.51 0.315 0.685 1.5
X5B_i 30 0.50 0.330 0.670 1.6
X4A_ii 20 0.33 0.50 0.403 0.597 2.2
X5C_i 35 0.31 0.50 0.368 0.632 1.9
X5B_iii 32 0.42 0.205 0.795 1.3
X2G_scaled 13 0.31 0.42 0.301 0.699 2.2
X4D_ii 25 0.39 0.234 0.766 2.0
X5D_ii 40 0.111 0.889 1.8
X3A 15 0.083 0.917 1.6
X5A_ii 29 0.075 0.925 2.5
X3C 17 0.088 0.912 3.0
X1E 6 0.79 0.643 0.357 1.1
X5D_iii 41 0.44 0.72 0.765 0.235 1.9
X4D_iv 27 0.34 0.55 0.64 0.827 0.173 2.5
X5D_i 39 0.37 0.48 0.56 0.700 0.300 2.9
X5B_vii 34 0.44 0.50 0.458 0.542 2.1
X3B 16 0.48 0.242 0.758 1.1
X1C 3 0.40 0.201 0.799 1.6
X5A_i 28 0.39 0.177 0.823 1.4
X4D_iii 26 0.36 0.194 0.806 2.1
X4C 23 0.35 0.199 0.801 2.4
X1D_ii 5 -0.33 0.271 0.729 3.3
X5B_ii 31 0.40 0.89 0.995 0.005 1.5
X5B_v 33 0.82 0.755 0.245 1.2
ML2 ML3 ML4 ML1
SS loadings 6.59 5.98 5.66 1.91
Proportion Var 0.16 0.15 0.14 0.05
Cumulative Var 0.16 0.31 0.44 0.49
Proportion Explained 0.33 0.30 0.28 0.10
Cumulative Proportion 0.33 0.62 0.90 1.00
Mean item complexity = 2.1
Test of the hypothesis that 4 factors are sufficient.
df null model = 820 with the objective function = 36.91 with Chi Square = 5518.29
df of the model are 662 and the objective function was 10.84
The root mean square of the residuals (RMSR) is 0.07
The df corrected root mean square of the residuals is 0.08
The harmonic n.obs is 165 with the empirical chi square 1339.97 with prob < 3e-48
The total n.obs was 165 with Likelihood Chi Square = 1592.1 with prob < 2.3e-78
Tucker Lewis Index of factoring reliability = 0.75
RMSEA index = 0.092 and the 90 % confidence intervals are 0.087 0.098
BIC = -1788.03
Fit based upon off diagonal values = 0.96
Measures of factor score adequacy
ML2 ML3 ML4 ML1
Correlation of (regression) scores with factors 0.96 0.94 0.93 0.99
Multiple R square of scores with factors 0.91 0.88 0.87 0.98
Minimum correlation of possible factor scores 0.83 0.76 0.74 0.96
efa_fit_table <- tibble::tibble(
Metric = c("TLI", "RMSEA", "RMSEA CI lower", "RMSEA CI upper"),
Value = c(efa_4factor$TLI, efa_4factor$RMSEA[1], efa_4factor$RMSEA[2], efa_4factor$RMSEA[3])
) %>%
mutate(Value = round(Value, 3))
variance_table <- data.frame(
Factor = paste0("Factor", 1:4),
SS_Loadings = efa_4factor$Vaccounted[1, ],
Proportion_Variance = efa_4factor$Vaccounted[2, ],
Cumulative_Variance = efa_4factor$Vaccounted[3, ]
) %>%
mutate(across(where(is.numeric), ~ round(.x, 3)))
efa_fit_table# A tibble: 4 × 2
Metric Value
<chr> <dbl>
1 TLI 0.75
2 RMSEA 0.092
3 RMSEA CI lower 0.087
4 RMSEA CI upper 0.098
knitr::kable(variance_table, caption = "EFA variance explained (4-factor solution)") %>%
kableExtra::kable_styling(full_width = FALSE)| Factor | SS_Loadings | Proportion_Variance | Cumulative_Variance | |
|---|---|---|---|---|
| ML2 | Factor1 | 6.592 | 0.161 | 0.161 |
| ML3 | Factor2 | 5.975 | 0.146 | 0.307 |
| ML4 | Factor3 | 5.664 | 0.138 | 0.445 |
| ML1 | Factor4 | 1.915 | 0.047 | 0.491 |
Interpretation:
TLIandRMSEAsummarize how well the 4-factor solution reproduces the correlation structure.- The variance shares are the basis for the empirical factor weights used in the ranking exercise.
loadings_df <- as.data.frame(unclass(efa_4factor$loadings))
loadings_df$Variable_Code <- rownames(loadings_df)
loading_cols <- grep("^ML", names(loadings_df), value = TRUE)
loadings_df$Max_Loading <- apply(abs(loadings_df[, loading_cols, drop = FALSE]), 1, max)
loadings_df$Primary_Factor <- apply(abs(loadings_df[, loading_cols, drop = FALSE]), 1, which.max)
loadings_df$Primary_Loading_Signed <- mapply(
function(i, f) loadings_df[i, loading_cols[f]],
i = seq_len(nrow(loadings_df)),
f = loadings_df$Primary_Factor
)
loadings_df$Variable <- ifelse(
loadings_df$Variable_Code %in% names(efa_var_labels),
paste0(loadings_df$Variable_Code, ": ", efa_var_labels[loadings_df$Variable_Code]),
loadings_df$Variable_Code
)
primary_loadings <- loadings_df %>%
filter(Max_Loading > 0.30) %>%
arrange(Primary_Factor, desc(Max_Loading))
knitr::kable(primary_loadings, digits = 3, caption = "Primary factor assignments (loadings > 0.30)") %>%
kableExtra::kable_styling(full_width = FALSE, font_size = 10)| ML2 | ML3 | ML4 | ML1 | Variable_Code | Max_Loading | Primary_Factor | Primary_Loading_Signed | Variable | |
|---|---|---|---|---|---|---|---|---|---|
| X2E_scaled | 0.794 | 0.205 | 0.463 | 0.126 | X2E_scaled | 0.794 | 1 | 0.794 | X2E_scaled: Legal integrity* |
| X2B_scaled | 0.748 | 0.367 | 0.458 | 0.111 | X2B_scaled | 0.748 | 1 | 0.748 | X2B_scaled: Impartial courts* |
| X2A_scaled | 0.742 | 0.224 | 0.522 | 0.117 | X2A_scaled | 0.742 | 1 | 0.742 | X2A_scaled: Judicial independence* |
| X2D_scaled | 0.730 | 0.249 | 0.364 | 0.137 | X2D_scaled | 0.730 | 1 | 0.730 | X2D_scaled: Military interference* |
| X5C_iii | 0.717 | 0.079 | 0.479 | 0.117 | X5C_iii | 0.717 | 1 | 0.717 | X5C_iii: Impartial public administration |
| X2H_scaled | 0.668 | 0.525 | 0.179 | 0.106 | X2H_scaled | 0.668 | 1 | 0.668 | X2H_scaled: Police reliability* |
| X1A | -0.632 | -0.227 | 0.141 | -0.028 | X1A | 0.632 | 1 | -0.632 | X1A: Government consumption |
| X2C_scaled | 0.609 | 0.509 | 0.523 | 0.016 | X2C_scaled | 0.609 | 1 | 0.609 | X2C_scaled: Property rights* |
| X2F_scaled | 0.601 | 0.495 | 0.335 | 0.177 | X2F_scaled | 0.601 | 1 | 0.601 | X2F_scaled: Contract enforcement* |
| X5C_ii | 0.509 | 0.468 | 0.434 | -0.069 | X5C_ii | 0.509 | 1 | 0.509 | X5C_ii: Bureaucracy costs |
| X1B | -0.457 | -0.132 | -0.300 | -0.147 | X1B | 0.457 | 1 | -0.457 | X1B: Transfers and subsidies |
| X4B_ii | 0.453 | 0.395 | 0.391 | 0.058 | X4B_ii | 0.453 | 1 | 0.453 | X4B_ii: Trade compliance costs |
| X1D_i | -0.362 | 0.251 | -0.329 | -0.069 | X1D_i | 0.362 | 1 | -0.362 | X1D_i: Top marginal income tax rate |
| X4B_i | 0.318 | 0.624 | 0.384 | -0.007 | X4B_i | 0.624 | 2 | 0.624 | X4B_i: Non-tariff barriers |
| X5C_iv | 0.392 | 0.595 | 0.156 | 0.152 | X5C_iv | 0.595 | 2 | 0.595 | X5C_iv: Tax compliance |
| X4D_i | 0.422 | 0.592 | 0.444 | 0.030 | X4D_i | 0.592 | 2 | 0.592 | X4D_i: Financial openness |
| X4A_i | 0.154 | 0.537 | 0.187 | 0.009 | X4A_i | 0.537 | 2 | 0.537 | X4A_i: Trade tax revenue |
| X3D | 0.215 | 0.505 | 0.119 | 0.006 | X3D | 0.505 | 2 | 0.505 | X3D: Foreign currency accounts |
| X5B_i | 0.085 | 0.502 | 0.061 | 0.258 | X5B_i | 0.502 | 2 | 0.502 | X5B_i: Hiring regulations and minimum wage |
| X4A_ii | 0.326 | 0.499 | 0.199 | 0.086 | X4A_ii | 0.499 | 2 | 0.499 | X4A_ii: Mean tariff rate |
| X5C_i | 0.312 | 0.499 | -0.121 | 0.086 | X5C_i | 0.499 | 2 | 0.499 | X5C_i: Regulatory burden |
| X5B_iii | -0.090 | 0.424 | 0.033 | 0.129 | X5B_iii | 0.424 | 2 | 0.424 | X5B_iii: Centralized collective bargaining |
| X2G_scaled | 0.307 | 0.422 | 0.167 | -0.004 | X2G_scaled | 0.422 | 2 | 0.422 | X2G_scaled: Real property restrictions* |
| X4D_ii | 0.143 | 0.389 | 0.250 | 0.015 | X4D_ii | 0.389 | 2 | 0.389 | X4D_ii: Capital controls |
| X1E | 0.103 | -0.106 | 0.786 | 0.062 | X1E | 0.786 | 3 | 0.786 | X1E: State ownership of assets |
| X5D_iii | 0.144 | 0.436 | 0.721 | 0.186 | X5D_iii | 0.721 | 3 | 0.721 | X5D_iii: Distortion of business environment |
| X4D_iv | 0.336 | 0.555 | 0.636 | 0.044 | X4D_iv | 0.636 | 3 | 0.636 | X4D_iv: Protection of foreign assets |
| X5D_i | 0.368 | 0.484 | 0.557 | 0.144 | X5D_i | 0.557 | 3 | 0.557 | X5D_i: Market openness |
| X5B_vii | 0.117 | 0.438 | 0.501 | 0.040 | X5B_vii | 0.501 | 3 | 0.501 | X5B_vii: Foreign labor |
| X3B | 0.074 | 0.055 | 0.479 | 0.070 | X3B | 0.479 | 3 | 0.479 | X3B: Inflation volatility |
| X1C | 0.175 | 0.113 | 0.397 | -0.014 | X1C | 0.397 | 3 | 0.397 | X1C: Government investment |
| X5A_i | 0.094 | 0.105 | 0.386 | 0.087 | X5A_i | 0.386 | 3 | 0.386 | X5A_i: Ownership of banks |
| X4D_iii | 0.191 | 0.142 | 0.358 | -0.098 | X4D_iii | 0.358 | 3 | 0.358 | X4D_iii: Freedom of foreigners to visit |
| X4C | 0.124 | 0.181 | 0.348 | -0.173 | X4C | 0.348 | 3 | 0.348 | X4C: Black market exchange rates |
| X1D_ii | -0.196 | 0.292 | -0.330 | -0.197 | X1D_ii | 0.330 | 3 | -0.330 | X1D_ii: Top marginal income and payroll tax rate |
| X5B_ii | 0.152 | 0.397 | 0.155 | 0.889 | X5B_ii | 0.889 | 4 | 0.889 | X5B_ii: Hiring and firing regulations |
| X5B_v | 0.217 | 0.174 | 0.039 | 0.822 | X5B_v | 0.822 | 4 | 0.822 | X5B_v: Mandated cost of worker dismissal |
Interpretation:
- This table is for substantive interpretation of the empirical dimensions.
- Cross-loadings or weak loadings should be reported as limitations in the paper.
# Paper-facing EFA loading heatmap (exported for manuscript inclusion).
loadings_long <- loadings_df %>%
select(Variable, all_of(loading_cols), Primary_Factor, Max_Loading) %>%
tidyr::pivot_longer(cols = all_of(loading_cols), names_to = "Factor", values_to = "Loading")
factor_labels <- setNames(paste0("Factor ", seq_along(loading_cols)), loading_cols)
loadings_long <- loadings_long %>%
mutate(
Factor = unname(factor_labels[Factor]),
Factor = factor(Factor, levels = unname(factor_labels)),
Variable = factor(
Variable,
levels = rev(primary_loadings %>% arrange(Primary_Factor, desc(Max_Loading)) %>% pull(Variable))
)
) %>%
filter(!is.na(Variable))
efa_loading_heatmap_v7 <- ggplot(loadings_long, aes(x = Factor, y = Variable, fill = Loading)) +
geom_tile(color = "grey85", linewidth = 0.2) +
geom_text(
aes(label = ifelse(abs(Loading) >= 0.30, sprintf("%.2f", Loading), "")),
size = 2.5
) +
scale_fill_gradient2(
low = "#d73027",
mid = "white",
high = "#1a9850",
midpoint = 0,
limits = c(-1, 1),
oob = scales::squish
) +
labs(
title = "4-Factor EFA Loading Heatmap",
subtitle = "Rows sorted by primary loading; labels shown for |loading| >= 0.30",
x = NULL, y = NULL, fill = "Loading"
) +
theme_minimal(base_size = 10) +
theme(
panel.grid = element_blank(),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 7),
plot.title = element_text(face = "bold"),
legend.position = "right"
)
dir.create("../paper_draft/images", recursive = TRUE, showWarnings = FALSE)
ggsave("../paper_draft/images/efa_loading_heatmap.png", efa_loading_heatmap_v7, width = 8.8, height = 11.5, dpi = 300)
write.csv(primary_loadings, "../data/clean/efa_primary_loadings_v7.csv", row.names = FALSE)
efa_loading_heatmap_v716.4 EFA in lavaan
Why the main EFA above uses psych::fa():
psychis more convenient for exploratory workflows (parallel analysis, VSS, rotated loadings, quick factor scores).lavaanis used later for CFA because it is stronger for confirmatory model comparison and SEM-style fit reporting.
Why add lavaan EFA:
- It provides a robustness check using the same Component/Subcomponent matrix inside the same modeling framework used for CFA.
- It helps compare whether the exploratory loading structure is broadly similar across implementations.
# Build a 4-factor exploratory block in lavaan using the same cleaned EFA matrix.
efa_vars_lav <- colnames(efa_data_clean)
lavaan_efa_syntax <- paste(
c(
paste0('efa("efa1")*F1 =~ ', paste(efa_vars_lav, collapse = " + ")),
paste0('efa("efa1")*F2 =~ ', paste(efa_vars_lav, collapse = " + ")),
paste0('efa("efa1")*F3 =~ ', paste(efa_vars_lav, collapse = " + ")),
paste0('efa("efa1")*F4 =~ ', paste(efa_vars_lav, collapse = " + "))
),
collapse = "\n"
)
lavaan_efa_fit <- with_detectcores_fallback(
lavaan::sem(
model = lavaan_efa_syntax,
data = efa_data_clean,
std.lv = TRUE,
rotation = "varimax",
missing = "fiml",
ncpus = 1L
)
)Warning: lavaan->lav_object_post_check():
some estimated ov variances are negative
lavaan_efa_fitlavaan 0.6-19 ended normally after 134 iterations
Estimator ML
Optimization method NLMINB
Number of model parameters 246
Row rank of the constraints matrix 6
Rotation method VARIMAX ORTHOGONAL
Rotation algorithm (rstarts) GPA (30)
Standardized metric TRUE
Row weights Kaiser
Number of observations 165
Number of missing patterns 1
Model Test User Model:
Test statistic 1761.685
Degrees of freedom 662
P-value (Chi-square) 0.000
lavaan_efa_fit_measures <- fitMeasures(
lavaan_efa_fit,
c("chisq", "df", "pvalue", "cfi", "tli", "rmsea", "rmsea.ci.lower", "rmsea.ci.upper", "srmr", "aic", "bic")
)
round(lavaan_efa_fit_measures, 3) chisq df pvalue cfi tli
1761.685 662.000 0.000 0.791 0.742
rmsea rmsea.ci.lower rmsea.ci.upper srmr aic
0.100 0.095 0.106 0.068 25441.310
bic
26186.737
lavaan_efa_loadings <- standardizedSolution(lavaan_efa_fit) %>%
dplyr::filter(op == "=~") %>%
dplyr::select(Factor = lhs, Variable = rhs, Std_Loading = est.std) %>%
dplyr::mutate(Abs_Loading = abs(Std_Loading)) %>%
dplyr::arrange(Factor, dplyr::desc(Abs_Loading))
knitr::kable(
dplyr::filter(lavaan_efa_loadings, Abs_Loading >= 0.30),
digits = 3,
caption = "lavaan EFA standardized loadings (|loading| >= 0.30)"
) %>%
kableExtra::kable_styling(full_width = FALSE, font_size = 10)| Factor | Variable | Std_Loading | Abs_Loading |
|---|---|---|---|
| F1 | X2E_scaled | 0.792 | 0.792 |
| F1 | X2B_scaled | 0.742 | 0.742 |
| F1 | X2A_scaled | 0.742 | 0.742 |
| F1 | X2D_scaled | 0.723 | 0.723 |
| F1 | X5C_iii | 0.716 | 0.716 |
| F1 | X2H_scaled | 0.657 | 0.657 |
| F1 | X1A | -0.613 | 0.613 |
| F1 | X2F_scaled | 0.596 | 0.596 |
| F1 | X2C_scaled | 0.590 | 0.590 |
| F1 | X5C_ii | 0.491 | 0.491 |
| F1 | X1B | -0.458 | 0.458 |
| F1 | X4B_ii | 0.441 | 0.441 |
| F1 | X4D_i | 0.399 | 0.399 |
| F1 | X5C_iv | 0.381 | 0.381 |
| F1 | X1D_i | -0.374 | 0.374 |
| F1 | X5D_i | 0.363 | 0.363 |
| F1 | X4D_iv | 0.320 | 0.320 |
| F1 | X4A_ii | 0.302 | 0.302 |
| F2 | X4B_i | 0.648 | 0.648 |
| F2 | X4D_i | 0.618 | 0.618 |
| F2 | X5C_iv | 0.593 | 0.593 |
| F2 | X4D_iv | 0.576 | 0.576 |
| F2 | X4A_i | 0.548 | 0.548 |
| F2 | X2H_scaled | 0.546 | 0.546 |
| F2 | X2C_scaled | 0.537 | 0.537 |
| F2 | X3D | 0.517 | 0.517 |
| F2 | X2F_scaled | 0.513 | 0.513 |
| F2 | X4A_ii | 0.507 | 0.507 |
| F2 | X5C_i | 0.493 | 0.493 |
| F2 | X5C_ii | 0.492 | 0.492 |
| F2 | X5D_i | 0.492 | 0.492 |
| F2 | X5B_i | 0.475 | 0.475 |
| F2 | X5B_vii | 0.449 | 0.449 |
| F2 | X5D_iii | 0.439 | 0.439 |
| F2 | X2G_scaled | 0.431 | 0.431 |
| F2 | X4B_ii | 0.413 | 0.413 |
| F2 | X5B_iii | 0.408 | 0.408 |
| F2 | X4D_ii | 0.402 | 0.402 |
| F2 | X2B_scaled | 0.390 | 0.390 |
| F2 | X5B_ii | 0.355 | 0.355 |
| F3 | X1E | 0.800 | 0.800 |
| F3 | X5D_iii | 0.701 | 0.701 |
| F3 | X4D_iv | 0.625 | 0.625 |
| F3 | X5D_i | 0.539 | 0.539 |
| F3 | X2A_scaled | 0.512 | 0.512 |
| F3 | X2C_scaled | 0.508 | 0.508 |
| F3 | X5B_vii | 0.494 | 0.494 |
| F3 | X3B | 0.483 | 0.483 |
| F3 | X5C_iii | 0.474 | 0.474 |
| F3 | X2E_scaled | 0.454 | 0.454 |
| F3 | X2B_scaled | 0.446 | 0.446 |
| F3 | X4D_i | 0.435 | 0.435 |
| F3 | X5C_ii | 0.424 | 0.424 |
| F3 | X5A_i | 0.399 | 0.399 |
| F3 | X1C | 0.398 | 0.398 |
| F3 | X4B_ii | 0.382 | 0.382 |
| F3 | X4B_i | 0.373 | 0.373 |
| F3 | X4D_iii | 0.360 | 0.360 |
| F3 | X2D_scaled | 0.359 | 0.359 |
| F3 | X4C | 0.343 | 0.343 |
| F3 | X1D_ii | -0.328 | 0.328 |
| F3 | X2F_scaled | 0.323 | 0.323 |
| F3 | X1D_i | -0.322 | 0.322 |
| F4 | X5B_ii | 1.120 | 1.120 |
| F4 | X5B_v | 0.680 | 0.680 |
Interpretation:
- Broad agreement with
psych::fa()strengthens the claim that the factor structure is not an artifact of a single package. - Differences can arise from estimation/rotation details and do not automatically invalidate either solution.
- The main empirical weighting pipeline below uses equal weights within assigned factor-variable sets (with loading signs retained), then equal weights across factors.
17 Empirical Weights and Rank Sensitivity
# Complete-case countries used by EFA
complete_countries <- df_2022_manual_subcomponent %>%
slice(which(complete.cases(efa_data))) %>%
pull(Countries)
# Base factor scores from psych::fa() (non-equal / variance-weighted model)
factor_scores <- as.data.frame(efa_4factor$scores)
colnames(factor_scores) <- c("Factor1", "Factor2", "Factor3", "Factor4")
factor_scores$Countries <- complete_countries
# Model A: variance-weighted factors (legacy / non-equal model)
variance_weights <- efa_4factor$Vaccounted[2, ] / sum(efa_4factor$Vaccounted[2, ])
factor_scores <- factor_scores %>%
mutate(
EFI_Empirical_Weighted =
variance_weights[1] * Factor1 +
variance_weights[2] * Factor2 +
variance_weights[3] * Factor3 +
variance_weights[4] * Factor4,
Rank_Empirical_Weighted = rank(-EFI_Empirical_Weighted, ties.method = "min"),
EFI_Empirical_Weighted_Scaled = as.numeric(5 + 2.5 * scale(EFI_Empirical_Weighted))
)
# Build forced equal-within-factor scores from assigned variables:
# - each variable is assigned to its max-|loading| factor
# - within each factor, assigned variables get equal weights
# - loading signs are retained (+/-)
# - factor weights are variance-based (can differ across factors)
z_data <- as.data.frame(scale(efa_data_clean))
assigned <- primary_loadings %>%
transmute(
Variable,
Variable_Code,
Primary_Factor,
Primary_Loading_Signed
)
equal_within_scores <- tibble::tibble(Countries = complete_countries)
for (f in 1:4) {
sub_f <- assigned %>% filter(Primary_Factor == f)
vars_f <- sub_f$Variable_Code
if (length(vars_f) == 0) {
equal_within_scores[[paste0("EqFactor", f)]] <- NA_real_
} else {
mat_f <- as.matrix(z_data[, vars_f, drop = FALSE])
signed_equal_w <- sign(sub_f$Primary_Loading_Signed) * (1 / nrow(sub_f))
equal_within_scores[[paste0("EqFactor", f)]] <- as.numeric(mat_f %*% signed_equal_w)
}
}
factor_scores <- factor_scores %>%
left_join(equal_within_scores, by = "Countries") %>%
mutate(
EFI_Factor_ForcedEqualWithin =
variance_weights[1] * EqFactor1 +
variance_weights[2] * EqFactor2 +
variance_weights[3] * EqFactor3 +
variance_weights[4] * EqFactor4,
Rank_Factor_ForcedEqualWithin = rank(-EFI_Factor_ForcedEqualWithin, ties.method = "min"),
EFI_Factor_ForcedEqualWithin_Scaled = as.numeric(5 + 2.5 * scale(EFI_Factor_ForcedEqualWithin)),
# keep both models
EFI_Empirical_Weighted_Variance = EFI_Empirical_Weighted,
Rank_Empirical_Weighted_Variance = Rank_Empirical_Weighted,
EFI_Empirical_Weighted_Scaled_Variance = EFI_Empirical_Weighted_Scaled,
# aliases consumed by downstream tables/plots: map to forced-equal-within model
EFI_Empirical_Weighted = EFI_Factor_ForcedEqualWithin,
Rank_Empirical_Weighted = Rank_Factor_ForcedEqualWithin,
EFI_Empirical_Weighted_Scaled = EFI_Factor_ForcedEqualWithin_Scaled
)
comparison <- df_2022_manual_subcomponent %>%
filter(Countries %in% factor_scores$Countries) %>%
select(Countries, Rank, `Economic Freedom Summary Index`) %>%
left_join(
factor_scores %>%
select(
Countries,
Rank_Empirical_Weighted,
Rank_Empirical_Weighted_Variance,
Rank_Factor_ForcedEqualWithin,
EFI_Empirical_Weighted_Scaled,
EFI_Factor_ForcedEqualWithin_Scaled,
EFI_Empirical_Weighted_Scaled_Variance
),
by = "Countries"
) %>%
mutate(
Rank_Change_Factor_ForcedEqualWithin = Rank - Rank_Factor_ForcedEqualWithin,
Rank_Change_Weighted = Rank - Rank_Factor_ForcedEqualWithin,
Rank_Change_Weighted_Variance = Rank - Rank_Empirical_Weighted_Variance
) %>%
arrange(Rank)
# Save outputs for the paper draft
write.csv(comparison, "../data/clean/efa_comparison_results_v10.csv", row.names = FALSE)
write.csv(factor_scores, "../data/clean/efa_factor_scores_v10.csv", row.names = FALSE)weights_tbl <- tibble::tibble(
Factor = paste0("Factor", seq_along(variance_weights)),
Forced_EqualWithin_Model = scales::percent(variance_weights, accuracy = 0.1),
Variance_Weighted_Model = scales::percent(variance_weights, accuracy = 0.1)
)
knitr::kable(weights_tbl, caption = "Factor-level weights (both models use variance-based factor weights)") %>%
kableExtra::kable_styling(full_width = FALSE)| Factor | Forced_EqualWithin_Model | Variance_Weighted_Model |
|---|---|---|
| Factor1 | 32.7% | 32.7% |
| Factor2 | 29.7% | 29.7% |
| Factor3 | 28.1% | 28.1% |
| Factor4 | 9.5% | 9.5% |
factor_variable_weights <- assigned %>%
group_by(Primary_Factor) %>%
mutate(
Loading_Based_Weight = sign(Primary_Loading_Signed) * abs(Primary_Loading_Signed) / sum(abs(Primary_Loading_Signed)),
Forced_Equal_Weight = sign(Primary_Loading_Signed) * (1 / n())
) %>%
ungroup()
plot_factor_weights <- factor_variable_weights %>%
select(Variable, Primary_Factor, Loading_Based_Weight, Forced_Equal_Weight) %>%
tidyr::pivot_longer(
cols = c(Loading_Based_Weight, Forced_Equal_Weight),
names_to = "Scheme",
values_to = "Weight"
) %>%
mutate(
Scheme = unname(c(
Loading_Based_Weight = "Within-factor loading-based weights",
Forced_Equal_Weight = "Within-factor forced equal weights"
)[Scheme]),
Factor = paste0("Factor ", Primary_Factor)
)
ggplot(plot_factor_weights, aes(x = reorder(Variable, abs(Weight)), y = Weight, fill = Weight > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
facet_grid(Scheme ~ Factor, scales = "free_y") +
scale_fill_manual(values = c("TRUE" = "#1a9850", "FALSE" = "#d73027")) +
labs(
title = "Factor-variable weights by scheme",
subtitle = "Variables assigned to their primary factor (max |loading|); sign retained",
x = NULL,
y = "Signed within-factor weight"
) +
theme_minimal(base_size = 9)loading_based_table <- factor_variable_weights %>%
transmute(
Factor = paste0("Factor ", Primary_Factor),
Variable,
Loading = Primary_Loading_Signed,
Weight = Loading_Based_Weight
) %>%
arrange(Factor, desc(abs(Loading)))
forced_equal_table <- factor_variable_weights %>%
transmute(
Factor = paste0("Factor ", Primary_Factor),
Variable,
Loading = Primary_Loading_Signed,
Weight = Forced_Equal_Weight
) %>%
arrange(Factor, desc(abs(Loading)))
knitr::kable(
loading_based_table,
digits = 4,
caption = "Within-factor variable weights: loading-based scheme"
) %>%
kableExtra::kable_styling(full_width = FALSE, font_size = 9)| Factor | Variable | Loading | Weight |
|---|---|---|---|
| Factor 1 | X2E_scaled: Legal integrity* | 0.7940 | 0.0990 |
| Factor 1 | X2B_scaled: Impartial courts* | 0.7477 | 0.0932 |
| Factor 1 | X2A_scaled: Judicial independence* | 0.7421 | 0.0925 |
| Factor 1 | X2D_scaled: Military interference* | 0.7300 | 0.0910 |
| Factor 1 | X5C_iii: Impartial public administration | 0.7166 | 0.0893 |
| Factor 1 | X2H_scaled: Police reliability* | 0.6678 | 0.0833 |
| Factor 1 | X1A: Government consumption | -0.6320 | -0.0788 |
| Factor 1 | X2C_scaled: Property rights* | 0.6086 | 0.0759 |
| Factor 1 | X2F_scaled: Contract enforcement* | 0.6011 | 0.0749 |
| Factor 1 | X5C_ii: Bureaucracy costs | 0.5088 | 0.0634 |
| Factor 1 | X1B: Transfers and subsidies | -0.4571 | -0.0570 |
| Factor 1 | X4B_ii: Trade compliance costs | 0.4529 | 0.0565 |
| Factor 1 | X1D_i: Top marginal income tax rate | -0.3618 | -0.0451 |
| Factor 2 | X4B_i: Non-tariff barriers | 0.6237 | 0.1116 |
| Factor 2 | X5C_iv: Tax compliance | 0.5951 | 0.1065 |
| Factor 2 | X4D_i: Financial openness | 0.5921 | 0.1059 |
| Factor 2 | X4A_i: Trade tax revenue | 0.5374 | 0.0962 |
| Factor 2 | X3D: Foreign currency accounts | 0.5050 | 0.0904 |
| Factor 2 | X5B_i: Hiring regulations and minimum wage | 0.5023 | 0.0899 |
| Factor 2 | X4A_ii: Mean tariff rate | 0.4990 | 0.0893 |
| Factor 2 | X5C_i: Regulatory burden | 0.4990 | 0.0893 |
| Factor 2 | X5B_iii: Centralized collective bargaining | 0.4237 | 0.0758 |
| Factor 2 | X2G_scaled: Real property restrictions* | 0.4224 | 0.0756 |
| Factor 2 | X4D_ii: Capital controls | 0.3888 | 0.0696 |
| Factor 3 | X1E: State ownership of assets | 0.7858 | 0.1429 |
| Factor 3 | X5D_iii: Distortion of business environment | 0.7210 | 0.1311 |
| Factor 3 | X4D_iv: Protection of foreign assets | 0.6359 | 0.1156 |
| Factor 3 | X5D_i: Market openness | 0.5566 | 0.1012 |
| Factor 3 | X5B_vii: Foreign labor | 0.5012 | 0.0912 |
| Factor 3 | X3B: Inflation volatility | 0.4787 | 0.0871 |
| Factor 3 | X1C: Government investment | 0.3973 | 0.0723 |
| Factor 3 | X5A_i: Ownership of banks | 0.3864 | 0.0703 |
| Factor 3 | X4D_iii: Freedom of foreigners to visit | 0.3579 | 0.0651 |
| Factor 3 | X4C: Black market exchange rates | 0.3476 | 0.0632 |
| Factor 3 | X1D_ii: Top marginal income and payroll tax rate | -0.3300 | -0.0600 |
| Factor 4 | X5B_ii: Hiring and firing regulations | 0.8889 | 0.5194 |
| Factor 4 | X5B_v: Mandated cost of worker dismissal | 0.8224 | 0.4806 |
knitr::kable(
forced_equal_table,
digits = 4,
caption = "Within-factor variable weights: forced-equal scheme (sign retained)"
) %>%
kableExtra::kable_styling(full_width = FALSE, font_size = 9)| Factor | Variable | Loading | Weight |
|---|---|---|---|
| Factor 1 | X2E_scaled: Legal integrity* | 0.7940 | 0.0769 |
| Factor 1 | X2B_scaled: Impartial courts* | 0.7477 | 0.0769 |
| Factor 1 | X2A_scaled: Judicial independence* | 0.7421 | 0.0769 |
| Factor 1 | X2D_scaled: Military interference* | 0.7300 | 0.0769 |
| Factor 1 | X5C_iii: Impartial public administration | 0.7166 | 0.0769 |
| Factor 1 | X2H_scaled: Police reliability* | 0.6678 | 0.0769 |
| Factor 1 | X1A: Government consumption | -0.6320 | -0.0769 |
| Factor 1 | X2C_scaled: Property rights* | 0.6086 | 0.0769 |
| Factor 1 | X2F_scaled: Contract enforcement* | 0.6011 | 0.0769 |
| Factor 1 | X5C_ii: Bureaucracy costs | 0.5088 | 0.0769 |
| Factor 1 | X1B: Transfers and subsidies | -0.4571 | -0.0769 |
| Factor 1 | X4B_ii: Trade compliance costs | 0.4529 | 0.0769 |
| Factor 1 | X1D_i: Top marginal income tax rate | -0.3618 | -0.0769 |
| Factor 2 | X4B_i: Non-tariff barriers | 0.6237 | 0.0909 |
| Factor 2 | X5C_iv: Tax compliance | 0.5951 | 0.0909 |
| Factor 2 | X4D_i: Financial openness | 0.5921 | 0.0909 |
| Factor 2 | X4A_i: Trade tax revenue | 0.5374 | 0.0909 |
| Factor 2 | X3D: Foreign currency accounts | 0.5050 | 0.0909 |
| Factor 2 | X5B_i: Hiring regulations and minimum wage | 0.5023 | 0.0909 |
| Factor 2 | X4A_ii: Mean tariff rate | 0.4990 | 0.0909 |
| Factor 2 | X5C_i: Regulatory burden | 0.4990 | 0.0909 |
| Factor 2 | X5B_iii: Centralized collective bargaining | 0.4237 | 0.0909 |
| Factor 2 | X2G_scaled: Real property restrictions* | 0.4224 | 0.0909 |
| Factor 2 | X4D_ii: Capital controls | 0.3888 | 0.0909 |
| Factor 3 | X1E: State ownership of assets | 0.7858 | 0.0909 |
| Factor 3 | X5D_iii: Distortion of business environment | 0.7210 | 0.0909 |
| Factor 3 | X4D_iv: Protection of foreign assets | 0.6359 | 0.0909 |
| Factor 3 | X5D_i: Market openness | 0.5566 | 0.0909 |
| Factor 3 | X5B_vii: Foreign labor | 0.5012 | 0.0909 |
| Factor 3 | X3B: Inflation volatility | 0.4787 | 0.0909 |
| Factor 3 | X1C: Government investment | 0.3973 | 0.0909 |
| Factor 3 | X5A_i: Ownership of banks | 0.3864 | 0.0909 |
| Factor 3 | X4D_iii: Freedom of foreigners to visit | 0.3579 | 0.0909 |
| Factor 3 | X4C: Black market exchange rates | 0.3476 | 0.0909 |
| Factor 3 | X1D_ii: Top marginal income and payroll tax rate | -0.3300 | -0.0909 |
| Factor 4 | X5B_ii: Hiring and firing regulations | 0.8889 | 0.5000 |
| Factor 4 | X5B_v: Mandated cost of worker dismissal | 0.8224 | 0.5000 |
dir.create("../paper_draft/tables", recursive = TRUE, showWarnings = FALSE)
write.csv(loading_based_table, "../paper_draft/tables/paper_factor_variable_weights_loading_v10.csv", row.names = FALSE)
write.csv(forced_equal_table, "../paper_draft/tables/paper_factor_variable_weights_forced_equal_v10.csv", row.names = FALSE)Interpretation:
- If the empirical weights are far from equal, that is direct evidence against the equal-weighting assumption as a neutral default.
rank_summary_table_local(comparison)# A tibble: 7 × 2
Metric Value
<chr> <chr>
1 Countries 165
2 Mean absolute rank change 14.2
3 Median absolute rank change 11
4 Countries with |rank change| > 10 89 (53.9%)
5 Countries with |rank change| > 20 39 (23.6%)
6 Spearman correlation 0.923
7 Pearson correlation 0.923
Interpretation:
- Correlation summarizes overall similarity in rankings, but the rank-change distribution captures practical ranking sensitivity.
gainers <- comparison %>%
filter(Rank_Change_Weighted > 0) %>%
arrange(desc(Rank_Change_Weighted)) %>%
slice_head(n = 10) %>%
select(Countries, Rank, Rank_Empirical_Weighted, Rank_Change_Weighted)
losers <- comparison %>%
filter(Rank_Change_Weighted < 0) %>%
arrange(Rank_Change_Weighted) %>%
slice_head(n = 10) %>%
select(Countries, Rank, Rank_Empirical_Weighted, Rank_Change_Weighted)
knitr::kable(gainers, caption = "Top 10 gainers under empirical weighting") %>%
kableExtra::kable_styling(full_width = FALSE)| Countries | Rank | Rank_Empirical_Weighted | Rank_Change_Weighted |
|---|---|---|---|
| Guyana | 136 | 85 | 51 |
| Ukraine | 150 | 99 | 51 |
| Azerbaijan | 128 | 86 | 42 |
| Türkiye | 138 | 98 | 40 |
| Fiji | 110 | 72 | 38 |
| Suriname | 141 | 103 | 38 |
| Papua New Guinea | 108 | 71 | 37 |
| Timor-Leste | 130 | 96 | 34 |
| Belgium | 42 | 11 | 31 |
| Rwanda | 92 | 63 | 29 |
knitr::kable(losers, caption = "Top 10 losers under empirical weighting") %>%
kableExtra::kable_styling(full_width = FALSE)| Countries | Rank | Rank_Empirical_Weighted | Rank_Change_Weighted |
|---|---|---|---|
| Indonesia | 64 | 119 | -55 |
| Honduras | 67 | 116 | -49 |
| Cambodia | 70 | 118 | -48 |
| Bolivia | 106 | 149 | -43 |
| Guatemala | 39 | 78 | -39 |
| Somalia | 116 | 153 | -37 |
| Dominican Republic | 49 | 84 | -35 |
| Panama | 23 | 57 | -34 |
| Nepal | 86 | 120 | -34 |
| Philippines | 59 | 91 | -32 |
# Export paper-facing ranking/weight tables close to where they are created.
paper_rank_summary_v7 <- comparison %>%
summarise(
n = n(),
spearman = cor(Rank, Rank_Empirical_Weighted, method = "spearman", use = "complete.obs"),
pearson = cor(Rank, Rank_Empirical_Weighted, method = "pearson", use = "complete.obs"),
mean_abs_change = mean(abs(Rank_Change_Weighted), na.rm = TRUE),
median_abs_change = median(abs(Rank_Change_Weighted), na.rm = TRUE),
gt10 = sum(abs(Rank_Change_Weighted) > 10, na.rm = TRUE),
gt20 = sum(abs(Rank_Change_Weighted) > 20, na.rm = TRUE)
)
paper_weights_v7 <- tibble::tibble(
Factor = paste0("Factor ", seq_along(variance_weights)),
Weight = as.numeric(variance_weights)
)
paper_summary_stats_v7 <- tibble::tibble(
Metric = c(
"Countries in rank-comparison sample",
"EFI score (original): mean",
"EFI score (original): SD",
"EFI score (original): min / max",
"Empirical weighted score (scaled): mean",
"Empirical weighted score (scaled): SD",
"Empirical weighted score (scaled): min / max",
"Absolute rank change (weighted): mean",
"Absolute rank change (weighted): median",
"Absolute rank change (weighted): max"
),
Value = c(
nrow(comparison),
round(mean(comparison$`Economic Freedom Summary Index`, na.rm = TRUE), 3),
round(sd(comparison$`Economic Freedom Summary Index`, na.rm = TRUE), 3),
sprintf("%.3f / %.3f",
min(comparison$`Economic Freedom Summary Index`, na.rm = TRUE),
max(comparison$`Economic Freedom Summary Index`, na.rm = TRUE)),
round(mean(comparison$EFI_Empirical_Weighted_Scaled, na.rm = TRUE), 3),
round(sd(comparison$EFI_Empirical_Weighted_Scaled, na.rm = TRUE), 3),
sprintf("%.3f / %.3f",
min(comparison$EFI_Empirical_Weighted_Scaled, na.rm = TRUE),
max(comparison$EFI_Empirical_Weighted_Scaled, na.rm = TRUE)),
round(mean(abs(comparison$Rank_Change_Weighted), na.rm = TRUE), 1),
round(median(abs(comparison$Rank_Change_Weighted), na.rm = TRUE), 1),
max(abs(comparison$Rank_Change_Weighted), na.rm = TRUE)
)
)
# Paper-facing exports consumed by the paper file.
dir.create("../paper_draft/tables", recursive = TRUE, showWarnings = FALSE)
write.csv(paper_rank_summary_v7, "../paper_draft/tables/paper_rank_summary_v9.csv", row.names = FALSE)
write.csv(gainers, "../paper_draft/tables/paper_top_gainers_v9.csv", row.names = FALSE)
write.csv(losers, "../paper_draft/tables/paper_top_losers_v9.csv", row.names = FALSE)
write.csv(paper_weights_v7, "../paper_draft/tables/paper_weights_v9.csv", row.names = FALSE)
write.csv(paper_summary_stats_v7, "../paper_draft/tables/paper_summary_stats_v9.csv", row.names = FALSE)
write.csv(paper_rank_summary_v7, "../paper_draft/tables/paper_rank_summary_v10.csv", row.names = FALSE)
write.csv(gainers, "../paper_draft/tables/paper_top_gainers_v10.csv", row.names = FALSE)
write.csv(losers, "../paper_draft/tables/paper_top_losers_v10.csv", row.names = FALSE)
write.csv(paper_weights_v7, "../paper_draft/tables/paper_weights_v10.csv", row.names = FALSE)
write.csv(paper_summary_stats_v7, "../paper_draft/tables/paper_summary_stats_v10.csv", row.names = FALSE)plot_rank_scatter_local(comparison)plot_rank_change_hist_local(comparison)18 CFA Comparison
cfa_model_4factor <- '
Legal =~ X2E_scaled + X2A_scaled + X2B_scaled + X2D_scaled +
X2H_scaled + X2C_scaled + X2F_scaled + X5C_iii + X1A + X1B
Trade =~ X5D_iii + X4D_iv + X5D_i + X4B_i + X4D_i + X5B_vii + X1E
Business =~ X5C_i + X5B_i
Labor =~ X5B_ii + X5B_v
'
cfa_model_5factor <- '
GovSize =~ X1A + X1B + X1C + X1D_i + X1D_ii + X1E
Legal =~ X2A_scaled + X2B_scaled + X2C_scaled + X2D_scaled +
X2E_scaled + X2F_scaled + X2G_scaled + X2H_scaled
Money =~ X3A + X3B + X3C + X3D
Trade =~ X4A_i + X4A_ii + X4B_i + X4B_ii + X4C +
X4D_i + X4D_ii + X4D_iii + X4D_iv
Regulation =~ X5A_i + X5A_ii + X5B_i + X5B_ii + X5B_iii +
X5B_v + X5B_vii + X5C_i + X5C_ii + X5C_iii +
X5C_iv + X5D_i + X5D_ii + X5D_iii
'
cfa_fit_4factor <- with_detectcores_fallback(
lavaan::cfa(cfa_model_4factor, data = efa_data_clean, std.lv = TRUE, missing = "fiml", ncpus = 1L)
)Warning: lavaan->lav_object_post_check():
some estimated ov variances are negative
cfa_fit_5factor <- with_detectcores_fallback(
lavaan::cfa(cfa_model_5factor, data = efa_data_clean, std.lv = TRUE, missing = "fiml", ncpus = 1L)
)
fit_4factor <- fitMeasures(cfa_fit_4factor, c("chisq", "df", "pvalue", "cfi", "tli", "rmsea", "rmsea.ci.lower", "rmsea.ci.upper", "srmr", "aic", "bic"))
fit_5factor <- fitMeasures(cfa_fit_5factor, c("chisq", "df", "pvalue", "cfi", "tli", "rmsea", "rmsea.ci.lower", "rmsea.ci.upper", "srmr", "aic", "bic"))
cfa_comparison_table <- data.frame(
Metric = c("CFI", "TLI", "RMSEA", "SRMR", "AIC", "BIC"),
FourFactor = c(fit_4factor["cfi"], fit_4factor["tli"], fit_4factor["rmsea"], fit_4factor["srmr"], fit_4factor["aic"], fit_4factor["bic"]),
FiveFactor = c(fit_5factor["cfi"], fit_5factor["tli"], fit_5factor["rmsea"], fit_5factor["srmr"], fit_5factor["aic"], fit_5factor["bic"]),
Better = c(
ifelse(fit_4factor["cfi"] > fit_5factor["cfi"], "4-Factor", "5-Factor"),
ifelse(fit_4factor["tli"] > fit_5factor["tli"], "4-Factor", "5-Factor"),
ifelse(fit_4factor["rmsea"] < fit_5factor["rmsea"], "4-Factor", "5-Factor"),
ifelse(fit_4factor["srmr"] < fit_5factor["srmr"], "4-Factor", "5-Factor"),
ifelse(fit_4factor["aic"] < fit_5factor["aic"], "4-Factor", "5-Factor"),
ifelse(fit_4factor["bic"] < fit_5factor["bic"], "4-Factor", "5-Factor")
)
)
cfa_anova <- tryCatch(anova(cfa_fit_4factor, cfa_fit_5factor), error = function(e) e)Warning: lavaan->lavTestLRT():
some models are based on a different set of observed variables
round(fit_4factor, 3) chisq df pvalue cfi tli
731.544 183.000 0.000 0.841 0.818
rmsea rmsea.ci.lower rmsea.ci.upper srmr aic
0.135 0.125 0.145 0.086 11936.904
bic
12151.214
round(fit_5factor, 3) chisq df pvalue cfi tli
2585.218 769.000 0.000 0.655 0.633
rmsea rmsea.ci.lower rmsea.ci.upper srmr aic
0.120 0.115 0.125 0.114 26050.844
bic
26463.935
cfa_comparison_table Metric FourFactor FiveFactor Better
cfi CFI 8.411768e-01 6.553939e-01 4-Factor
tli TLI 8.177439e-01 6.325397e-01 4-Factor
rmsea RMSEA 1.347840e-01 1.196407e-01 5-Factor
srmr SRMR 8.631649e-02 1.139616e-01 4-Factor
aic AIC 1.193690e+04 2.605084e+04 4-Factor
bic BIC 1.215121e+04 2.646393e+04 4-Factor
cfa_anova
Chi-Squared Difference Test
Df AIC BIC Chisq Chisq diff RMSEA Df diff Pr(>Chisq)
cfa_fit_4factor 183 11937 12151 731.54
cfa_fit_5factor 769 26051 26464 2585.22 1853.7 0.1145 586 < 2.2e-16
cfa_fit_4factor
cfa_fit_5factor ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Export paper-facing CFA tables close to the CFA estimation/output section.
paper_cfa_fit_v7 <- tibble::tibble(
Model = c("4-factor (empirical)", "5-factor (EFI-style)"),
CFI = c(unname(fit_4factor["cfi"]), unname(fit_5factor["cfi"])),
TLI = c(unname(fit_4factor["tli"]), unname(fit_5factor["tli"])),
RMSEA = c(unname(fit_4factor["rmsea"]), unname(fit_5factor["rmsea"])),
SRMR = c(unname(fit_4factor["srmr"]), unname(fit_5factor["srmr"])),
AIC = c(unname(fit_4factor["aic"]), unname(fit_5factor["aic"])),
BIC = c(unname(fit_4factor["bic"]), unname(fit_5factor["bic"]))
)
paper_chisq_compare_v7 <- {
if (inherits(cfa_anova, "error")) {
tibble::tibble(
Statistic = c("Chi-square difference", "Difference in df", "p-value"),
Value = c(NA_character_, NA_character_, NA_character_)
)
} else {
an <- as.data.frame(cfa_anova)
nm <- names(an)
chisq_col <- nm[grepl("Chisq[. ]diff", nm)][1]
dfdiff_col <- nm[grepl("Df[. ]diff", nm)][1]
p_col <- nm[grepl("Pr\\(>Chisq\\)", nm)][1]
last <- nrow(an)
tibble::tibble(
Statistic = c("Chi-square difference", "Difference in df", "p-value"),
Value = c(
as.character(round(an[[chisq_col]][last], 1)),
as.character(an[[dfdiff_col]][last]),
as.character(an[[p_col]][last])
)
)
}
}
dir.create("../paper_draft/tables", recursive = TRUE, showWarnings = FALSE)
write.csv(paper_cfa_fit_v7, "../paper_draft/tables/paper_cfa_fit_v7.csv", row.names = FALSE)
write.csv(paper_chisq_compare_v7, "../paper_draft/tables/paper_chisq_compare_v7.csv", row.names = FALSE)
write.csv(paper_cfa_fit_v7, "../paper_draft/tables/paper_cfa_fit_v9.csv", row.names = FALSE)
write.csv(paper_chisq_compare_v7, "../paper_draft/tables/paper_chisq_compare_v9.csv", row.names = FALSE)Interpretation:
- Compare models using both relative fit (
CFI,TLI,SRMR,AIC,BIC) and absolute fit (especiallyRMSEA). - A better-fitting model can still fit poorly in absolute terms; report both facts.
- To do: if
lavaanreports warnings (e.g., negative variances / Heywood cases), document them explicitly in the paper and treat the CFA results as provisional.
19 Appendix: Correlation Tables
19.1 Correlation Table: Rank and the Five Official Areas
rank_area_corr_df <- df_2022_manual_subcomponent %>%
select(Rank, Area1, Area2, Area3, Area4, Area5)
rank_area_corr <- cor(rank_area_corr_df, use = "pairwise.complete.obs", method = "pearson")
round(rank_area_corr, 3) Rank Area1 Area2 Area3 Area4 Area5
Rank 1.000 -0.055 -0.867 -0.737 -0.858 -0.855
Area1 -0.055 1.000 -0.211 -0.065 -0.080 -0.015
Area2 -0.867 -0.211 1.000 0.495 0.739 0.800
Area3 -0.737 -0.065 0.495 1.000 0.597 0.589
Area4 -0.858 -0.080 0.739 0.597 1.000 0.746
Area5 -0.855 -0.015 0.800 0.589 0.746 1.000
knitr::kable(
round(rank_area_corr, 3),
caption = "Correlation Table: Rank and the Five Official EFI Area Scores"
) %>%
kableExtra::kable_styling(full_width = FALSE)| Rank | Area1 | Area2 | Area3 | Area4 | Area5 | |
|---|---|---|---|---|---|---|
| Rank | 1.000 | -0.055 | -0.867 | -0.737 | -0.858 | -0.855 |
| Area1 | -0.055 | 1.000 | -0.211 | -0.065 | -0.080 | -0.015 |
| Area2 | -0.867 | -0.211 | 1.000 | 0.495 | 0.739 | 0.800 |
| Area3 | -0.737 | -0.065 | 0.495 | 1.000 | 0.597 | 0.589 |
| Area4 | -0.858 | -0.080 | 0.739 | 0.597 | 1.000 | 0.746 |
| Area5 | -0.855 | -0.015 | 0.800 | 0.589 | 0.746 | 1.000 |
19.2 Correlation Table: Rank + Components/Subcomponents by Area
area_component_lists <- list(
`Area 1` = c("X1A", "X1B", "X1C", "X1D_i", "X1D_ii", "X1E"),
`Area 2` = c("X2A_scaled", "X2B_scaled", "X2C_scaled", "X2D_scaled", "X2E_scaled", "X2F_scaled", "X2G_scaled", "X2H_scaled"),
`Area 3` = c("X3A", "X3B", "X3C", "X3D"),
`Area 4` = c("X4A_i", "X4A_ii", "X4A_iii", "X4B_i", "X4B_ii", "X4C", "X4D_i", "X4D_ii", "X4D_iii", "X4D_iv"),
`Area 5` = c("X5A_i", "X5A_ii", "X5A_iii", "X5B_i", "X5B_ii", "X5B_iii", "X5B_iv", "X5B_v", "X5B_vi", "X5B_vii",
"X5C_i", "X5C_ii", "X5C_iii", "X5C_iv", "X5D_i", "X5D_ii", "X5D_iii")
)
for (nm in names(area_component_lists)) {
d_plot <- df_2022_manual_subcomponent %>%
dplyr::select(Rank, all_of(area_component_lists[[nm]]))
print(
corplot(
d_plot,
title = paste0(nm, ": Rank + Components/Subcomponents"),
hc_order = FALSE
)
)
}19.3 Export XLS: Country, Year, Old/New Rank, Areas, Components, Subcomponents
area_cols <- c("Area1", "Area2", "Area3", "Area4", "Area5")
component_cols <- c(
"X1A", "X1B", "X1C", "X1D", "X1E",
"X2A", "X2B", "X2C", "X2D", "X2E", "X2F", "X2G", "X2H",
"X2A_scaled", "X2B_scaled", "X2C_scaled", "X2D_scaled", "X2E_scaled", "X2F_scaled", "X2G_scaled", "X2H_scaled",
"X3A", "X3B", "X3C", "X3D",
"X4A", "X4B", "X4C", "X4D",
"X5A", "X5B", "X5C", "X5D"
)
subcomponent_cols <- c(
"X1D_i", "X1D_ii",
"X4A_i", "X4A_ii", "X4A_iii",
"X4B_i", "X4B_ii",
"X4D_i", "X4D_ii", "X4D_iii", "X4D_iv",
"X5A_i", "X5A_ii", "X5A_iii",
"X5B_i", "X5B_ii", "X5B_iii", "X5B_iv", "X5B_v", "X5B_vi", "X5B_vii",
"X5C_i", "X5C_ii", "X5C_iii", "X5C_iv",
"X5D_i", "X5D_ii", "X5D_iii"
)
old_new_rank <- comparison %>%
transmute(
Countries,
Old_Rank = as.integer(Rank),
New_Rank = as.integer(Rank_Empirical_Weighted)
)
export_cols <- c("Countries", "Year", area_cols, component_cols, subcomponent_cols)
export_rank_hierarchy <- df_2022_manual_subcomponent %>%
select(all_of(intersect(export_cols, names(df_2022_manual_subcomponent)))) %>%
left_join(old_new_rank, by = "Countries") %>%
relocate(Countries, Year, Old_Rank, New_Rank)
dir.create("../data/clean", recursive = TRUE, showWarnings = FALSE)
write_xlsx(
list(country_year_old_new_rank_hierarchy = export_rank_hierarchy),
"../data/clean/country_year_old_new_rank_areas_components_subcomponents_v9.xlsx"
)Warning in write_xlsx(list(country_year_old_new_rank_hierarchy =
export_rank_hierarchy), : Truncating sheet name(s) to 31 characters
head(export_rank_hierarchy, 5)# A tibble: 5 × 70
Countries Year Old_Rank New_Rank Area1 Area2 Area3 Area4 Area5 X1A X1B
<chr> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Hong Kong S… 2022 1 16 7.34 7.49 9.53 9.66 8.86 6.57 9.00
2 Singapore 2022 2 6 7.32 8.40 8.71 9.56 8.73 4.37 9.08
3 Switzerland 2022 3 3 7.60 8.92 9.55 8.11 7.98 6.35 5.52
4 New Zealand 2022 4 2 6.39 9.00 8.83 8.94 8.78 3.95 5.27
5 United Stat… 2022 5 8 7.35 7.78 8.53 8.11 8.66 6.78 5.69
# ℹ 59 more variables: X1C <dbl>, X1D <dbl>, X1E <dbl>, X2A <dbl>, X2B <dbl>,
# X2C <dbl>, X2D <dbl>, X2E <dbl>, X2F <dbl>, X2G <dbl>, X2H <dbl>,
# X2A_scaled <labelled>, X2B_scaled <labelled>, X2C_scaled <labelled>,
# X2D_scaled <labelled>, X2E_scaled <labelled>, X2F_scaled <labelled>,
# X2G_scaled <labelled>, X2H_scaled <labelled>, X3A <dbl>, X3B <dbl>,
# X3C <dbl>, X3D <dbl>, X4A <dbl>, X4B <dbl>, X4C <dbl>, X4D <dbl>,
# X5A <dbl>, X5B <dbl>, X5C <dbl>, X5D <dbl>, X1D_i <dbl>, X1D_ii <dbl>, …