0.1 Introduction

This report presents a comparative analysis of three regression models used to predict national-level AI competitiveness scores. The models include:

  1. Partial Least Squares Regression (PLSR)
  2. Random Forest
  3. Neural Network (Multilayer Perceptron)

The analysis evaluates each model by its Root Mean Square Error (RMSE) on a held-out test set drawn from the z-score-standardized data.
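For reference, every RMSE reported below is computed with the same expression, sqrt(mean((predicted - actual)^2)); a minimal helper equivalent to it (an illustrative sketch, not part of the original scripts) would be:

# Root Mean Square Error: square root of the mean squared prediction error
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}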

0.2 Data Preparation

library(caret)  
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(readr)
library(kableExtra)  
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Load dataset
Ai_index <- readr::read_csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/AI_index_db.csv")
## Rows: 62 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Country, Region, Cluster, Income group, Political regime
## dbl (8): Talent, Infrastructure, Operating Environment, Research, Developmen...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gdp_raw <- readr::read_csv(
  "https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/gdp_gini_data.csv", 
  col_names = "raw"
)
## Rows: 210 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): raw
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gdp_data <- gdp_raw %>%
  tidyr::separate(raw, into = c("Country", "GDP_Per_Capita", "Gini_Index", "GDP"), sep = ",", fill = "right") %>%
  mutate(across(everything(), trimws)) %>%
  mutate(
    GDP_Per_Capita = as.numeric(gsub("[^0-9.]", "", GDP_Per_Capita)),
    Gini_Index = as.numeric(gsub("[^0-9.]", "", Gini_Index)),
    GDP = as.numeric(gsub("[^0-9.]", "", GDP))
  )

Ai_index <- Ai_index %>% mutate(Country = trimws(Country))

merged_data <- Ai_index %>%
  left_join(gdp_data, by = "Country")

cleaned_data <- merged_data %>%
  mutate(
    RnD = (Research + Development) / 2,
    Policy = `Government Strategy`,
    AI_Score = `Total score`
  ) %>%
  select(Country, Region, `Income group`, `Political regime`,
         Talent, RnD, Policy, AI_Score,
         GDP_Per_Capita, GDP, Gini_Index) %>%
  filter(!is.na(AI_Score))

numeric_vars <- cleaned_data %>%
  select(where(is.numeric)) %>%
  scale(center = TRUE, scale = TRUE) %>%
  as.data.frame()

cleaned_data_scaled <- bind_cols(
  cleaned_data %>% select(-where(is.numeric)),
  numeric_vars
)

model_data <- cleaned_data_scaled %>%
  select(AI_Score, Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index) %>%
  na.omit()

set.seed(131017)
train_index <- createDataPartition(model_data$AI_Score, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data  <- model_data[-train_index, ]
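# Quick sanity check (illustrative addition, not in the original script):
# confirm the approximate 80/20 split of the NA-free modeling data
c(train = nrow(train_data), test = nrow(test_data))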

1 Analysis Pipeline

# 1. Load necessary libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(caret)
library(corrplot)
## corrplot 0.95 loaded
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(nnet)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggcorrplot)
library(ggthemes)
library(tensorflow)
## 
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
## 
##     train
# 2. Load AI Index dataset
Ai_index <- readr::read_csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/AI_index_db.csv")
## Rows: 62 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Country, Region, Cluster, Income group, Political regime
## dbl (8): Talent, Infrastructure, Operating Environment, Research, Developmen...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 3. Load GDP and Gini data (1-column CSV, needs parsing)
gdp_raw <- readr::read_csv(
  "https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/gdp_gini_data.csv", 
  col_names = "raw"
)
## Rows: 210 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): raw
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 4. Clean and separate GDP data
gdp_data <- gdp_raw %>%
  separate(raw, into = c("Country", "GDP_Per_Capita", "Gini_Index", "GDP"), sep = ",", fill = "right") %>%
  mutate(across(everything(), trimws)) %>%
  mutate(
    GDP_Per_Capita = as.numeric(gsub("[^0-9.]", "", GDP_Per_Capita)),
    Gini_Index = as.numeric(gsub("[^0-9.]", "", Gini_Index)),
    GDP = as.numeric(gsub("[^0-9.]", "", GDP))
  )

# 5. Clean AI Index country names
Ai_index <- Ai_index %>%
  mutate(Country = trimws(Country))

# 6. Merge datasets
merged_data <- Ai_index %>%
  left_join(gdp_data, by = "Country")

# 7. Clean and rename relevant variables
cleaned_data <- merged_data %>%
  mutate(
    RnD = (Research + Development) / 2,   # Average of Research and Development
    Policy = `Government Strategy`,       # Rename for clarity
    AI_Score = `Total score`              # Rename for modeling
  ) %>%
  select(
    Country, Region, `Income group`, `Political regime`,
    Talent, RnD, Policy, AI_Score,
    GDP_Per_Capita, GDP, Gini_Index
  )

# 8. Filter out rows with missing AI_Score (essential for modeling)
cleaned_data <- cleaned_data %>%
  filter(!is.na(AI_Score))

# 9. Standardize numeric variables using Z-scores
numeric_vars <- cleaned_data %>%
  select(where(is.numeric)) %>%
  scale(center = TRUE, scale = TRUE) %>%
  as.data.frame()

# 10. Combine standardized numerics with categorical columns
cleaned_data_scaled <- bind_cols(
  cleaned_data %>% select(-where(is.numeric)),
  numeric_vars
)

# 11. Create AI Competitiveness Tiers
cleaned_data_scaled <- cleaned_data_scaled %>%
  mutate(Tier = case_when(
    AI_Score >= quantile(AI_Score, 0.67, na.rm = TRUE) ~ "Leader",
    AI_Score >= quantile(AI_Score, 0.33, na.rm = TRUE) ~ "Follower",
    TRUE ~ "Laggard"
  ))

# 12. Quick data checks
glimpse(cleaned_data_scaled)
## Rows: 62
## Columns: 12
## $ Country            <chr> "United States of America", "China", "United Kingdo…
## $ Region             <chr> "Americas", "Asia-Pacific", "Europe", "Americas", "…
## $ `Income group`     <chr> "High", "Upper middle", "High", "High", "High", "Hi…
## $ `Political regime` <chr> "Liberal democracy", "Closed autocracy", "Liberal d…
## $ Talent             <dbl> 5.46809990, -0.01926160, 1.50160970, 0.95149333, 1.…
## $ RnD                <dbl> 4.76303426, 3.38949496, 0.85038281, 0.70684075, 0.8…
## $ Policy             <dbl> 0.74371558, 1.40955823, 0.95055343, 1.60496858, -0.…
## $ AI_Score           <dbl> 5.0309049, 2.5791054, 1.1250852, 1.0761550, 1.05631…
## $ GDP_Per_Capita     <dbl> NA, -0.83007921, 0.64426449, 0.65844087, 0.43161876…
## $ GDP                <dbl> NA, -0.8959691, -0.9568674, -0.9609821, 1.1782757, …
## $ Gini_Index         <dbl> NA, 0.468006500, 0.003676819, -0.242144777, 0.53629…
## $ Tier               <chr> "Leader", "Leader", "Leader", "Leader", "Leader", "…
table(cleaned_data_scaled$Tier)
## 
## Follower  Laggard   Leader 
##       20       21       21
summary(cleaned_data_scaled)
##    Country             Region          Income group       Political regime  
##  Length:62          Length:62          Length:62          Length:62         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      Talent             RnD              Policy           AI_Score       
##  Min.   :-1.1044   Min.   :-0.8710   Min.   :-2.2042   Min.   :-1.58128  
##  1st Qu.:-0.6203   1st Qu.:-0.6926   1st Qu.:-0.6413   1st Qu.:-0.60235  
##  Median :-0.2207   Median :-0.3252   Median : 0.2310   Median :-0.04593  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.5103   3rd Qu.: 0.4011   3rd Qu.: 0.7651   3rd Qu.: 0.43461  
##  Max.   : 5.4681   Max.   : 4.7630   Max.   : 1.6050   Max.   : 5.03090  
##                                                                          
##  GDP_Per_Capita         GDP            Gini_Index          Tier          
##  Min.   :-1.1467   Min.   :-0.9651   Min.   :-1.4303   Length:62         
##  1st Qu.:-0.8088   1st Qu.:-0.9244   1st Qu.:-0.6553   Class :character  
##  Median :-0.3245   Median :-0.4174   Median :-0.2285   Mode  :character  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000                     
##  3rd Qu.: 0.6868   3rd Qu.: 0.7073   3rd Qu.: 0.4851                     
##  Max.   : 3.0259   Max.   : 2.7583   Max.   : 3.8139                     
##  NA's   :3         NA's   :3         NA's   :10
# 13. (Optional) Save the cleaned dataset
# readr::write_csv(cleaned_data_scaled, "cleaned_ai_gdp_data.csv")

1.1 EDA

1.1.1 Summary Statistics Table (Grouped by AI Tier)

# Generate summary stats per Tier
summary_stats <- cleaned_data_scaled %>%
  group_by(Tier) %>%
  summarise(
    Talent_Mean = mean(Talent, na.rm = TRUE),
    Talent_SD = sd(Talent, na.rm = TRUE),
    RnD_Mean = mean(RnD, na.rm = TRUE),
    RnD_SD = sd(RnD, na.rm = TRUE),
    Policy_Mean = mean(Policy, na.rm = TRUE),
    Policy_SD = sd(Policy, na.rm = TRUE),
    GDP_Per_Capita_Mean = mean(GDP_Per_Capita, na.rm = TRUE),
    GDP_Per_Capita_SD = sd(GDP_Per_Capita, na.rm = TRUE),
    GDP_Mean = mean(GDP, na.rm = TRUE),
    GDP_SD = sd(GDP, na.rm = TRUE),
    Gini_Index_Mean = mean(Gini_Index, na.rm = TRUE),
    Gini_Index_SD = sd(Gini_Index, na.rm = TRUE),
    .groups = "drop"
  )

# Create journal-style table
summary_stats %>%
  kable(format = "html", digits = 2, caption = "Descriptive Statistics by AI Competitiveness Tier") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Descriptive Statistics by AI Competitiveness Tier

Tier      Talent       RnD          Policy       GDP_Per_Capita  GDP          Gini_Index
          Mean   SD    Mean   SD    Mean   SD    Mean   SD       Mean   SD    Mean   SD
Follower  -0.20  0.38  -0.23  0.34   0.24  0.72   0.05  0.76     -0.07  0.96  -0.48  0.97
Laggard   -0.72  0.21  -0.75  0.12  -0.73  0.97  -0.77  0.57      0.14  0.98   0.63  1.10
Leader     0.92  1.18   0.97  1.14   0.50  0.86   0.80  0.95     -0.09  1.09  -0.25  0.43

1.1.2 ANOVA Tests by Tier

# Run ANOVA and tidy results
vars <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index")

anova_results <- lapply(vars, function(var) {
  formula <- as.formula(paste(var, "~ Tier"))
  aov_model <- aov(formula, data = cleaned_data_scaled)
  tidy_res <- summary(aov_model)[[1]]
  data.frame(Variable = var, 
             F = tidy_res$`F value`[1], 
             p_value = tidy_res$`Pr(>F)`[1])
})

anova_table <- bind_rows(anova_results)

# Display in table format
anova_table %>%
  mutate(Significance = case_when(
    p_value < 0.001 ~ "***",
    p_value < 0.01 ~ "**",
    p_value < 0.05 ~ "*",
    TRUE ~ ""
  )) %>%
  kable(format = "html", digits = 3, caption = "ANOVA Results by AI Tier") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
ANOVA Results by AI Tier

Variable        F       p_value  Significance
Talent          27.497  0.000    ***
RnD             33.531  0.000    ***
Policy          12.010  0.000    ***
GDP_Per_Capita  20.756  0.000    ***
GDP             0.310   0.735
Gini_Index      7.704   0.001    **
anova_table
##         Variable          F      p_value
## 1         Talent 27.4965605 3.649340e-09
## 2            RnD 33.5311907 1.874398e-10
## 3         Policy 12.0103149 4.208687e-05
## 4 GDP_Per_Capita 20.7563143 1.801194e-07
## 5            GDP  0.3096422 7.349575e-01
## 6     Gini_Index  7.7037319 1.232728e-03
# Load libraries
library(gt)
library(dplyr)

# Create the data frame
ai_significance_table <- data.frame(
  Variable = c("RnD", "Talent", "GDP/Capita", "Policy", "Gini Index", "GDP (total)"),
  `F-Statistic` = c(33.53, 27.50, 20.76, 12.01, 7.70, 0.31),
  `p-value` = c(1.87e-10, 3.65e-09, 1.80e-07, 4.21e-05, 0.00123, 0.73496),
  check.names = FALSE  # preserve column names with hyphens
)

# Generate the table
ai_significance_table %>%
  gt() %>%
  tab_header(
    title = md("**Statistical Significance of Selected Predictors with Respect to AI_Score**")
  ) %>%
  fmt_number(
    columns = all_of("F-Statistic"),
    decimals = 2
  ) %>%
  fmt_scientific(
    columns = all_of("p-value"),
    decimals = 2
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = "Variable")
  ) %>%
  cols_label(
    Variable = "Variable",
    `F-Statistic` = "F-Statistic",
    `p-value` = "p-value"
  ) %>%
  tab_options(
    table.font.size = 12,
    table.width = pct(90),
    heading.title.font.size = 14,
    heading.title.font.weight = "bold"
  )
Statistical Significance of Selected Predictors with Respect to AI_Score

Variable     F-Statistic  p-value
RnD          33.53        1.87 × 10⁻¹⁰
Talent       27.50        3.65 × 10⁻⁹
GDP/Capita   20.76        1.80 × 10⁻⁷
Policy       12.01        4.21 × 10⁻⁵
Gini Index    7.70        1.23 × 10⁻³
GDP (total)   0.31        7.35 × 10⁻¹
# Load libraries
library(gt)
library(dplyr)

# Create the data frame with significance stars
ai_significance_table <- data.frame(
  Variable = c("RnD", "Talent", "GDP/Capita", "Policy", "Gini Index", "GDP (total)"),
  `F-Statistic` = c(33.53, 27.50, 20.76, 12.01, 7.70, 0.31),
  `p-value` = c(1.87e-10, 3.65e-09, 1.80e-07, 4.21e-05, 0.00123, 0.73496),
  check.names = FALSE
) %>%
  mutate(
    Significance = case_when(
      `p-value` < 0.001 ~ "***",
      `p-value` < 0.01  ~ "**",
      `p-value` < 0.05  ~ "*",
      TRUE              ~ ""
    ),
    `p-value` = paste0(formatC(`p-value`, format = "e", digits = 2), " ", Significance)
  ) %>%
  select(-Significance)  # Remove intermediate column

# Generate the styled table
ai_significance_table %>%
  gt() %>%
  tab_header(
    title = md("**Statistical Significance of Selected Predictors with Respect to AI_Score**")
  ) %>%
  fmt_number(
    columns = all_of("F-Statistic"),
    decimals = 2
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = "Variable")
  ) %>%
  cols_label(
    Variable = "Variable",
    `F-Statistic` = "F-Statistic",
    `p-value` = "p-value"
  ) %>%
  tab_options(
    table.font.size = 12,
    table.width = pct(90),
    heading.title.font.size = 14,
    heading.title.font.weight = "bold"
  )
Statistical Significance of Selected Predictors with Respect to AI_Score

Variable     F-Statistic  p-value
RnD          33.53        1.87e-10 ***
Talent       27.50        3.65e-09 ***
GDP/Capita   20.76        1.80e-07 ***
Policy       12.01        4.21e-05 ***
Gini Index    7.70        1.23e-03 **
GDP (total)   0.31        7.35e-01

1.1.3 Boxplots by Tier

# Reshape the data to long format for faceted boxplots
plot_data <- cleaned_data_scaled %>%
  select(Tier, Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index) %>%
  pivot_longer(-Tier, names_to = "Variable", values_to = "Value")

# Boxplot
ggplot(plot_data, aes(x = Tier, y = Value, fill = Tier)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  facet_wrap(~Variable, scales = "free", ncol = 3) +
  labs(title = "Distribution of AI Determinants by Tier", x = "AI Competitiveness Tier", y = "Standardized Value") +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).


1.1.4 Correlation Matrix and Heatmap

# Load necessary package
library(ggcorrplot)

# Compute correlation matrix of numeric variables
cor_matrix <- cleaned_data_scaled %>%
  select(Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index, AI_Score) %>%
  cor(use = "pairwise.complete.obs")

# Plot correlation heatmap
corr_fig <- ggcorrplot(cor_matrix, 
                       hc.order = TRUE, 
                       type = "lower", 
                       lab = TRUE, 
                       lab_size = 3, 
                       method = "circle", 
                       colors = c("red", "white", "blue"),
                       title = "Correlation Matrix of AI Determinants",
                       ggtheme = theme_minimal(base_size = 12))

# Display the plot
print(corr_fig)

library(knitr)
library(kableExtra)
library(tibble)

# Original matrix input
cor_matrix <- matrix(c(
  1.000, 0.773, 0.322, 0.561, 0.025, -0.294, 0.862,
  0.773, 1.000, 0.422, 0.446, -0.102, -0.208, 0.941,
  0.322, 0.422, 1.000, 0.184, -0.158, -0.224, 0.532,
  0.561, 0.446, 0.184, 1.000, 0.183, -0.417, 0.547,
  0.025, -0.102, -0.158, 0.183, 1.000, -0.019, -0.105,
  -0.294, -0.208, -0.224, -0.417, -0.019, 1.000, -0.257,
  0.862, 0.941, 0.532, 0.547, -0.105, -0.257, 1.000
), nrow = 7, byrow = TRUE)

vars <- c("Talent", "RnD", "Policy", "GDP_PC", "GDP", "Gini", "AI_Score")
dimnames(cor_matrix) <- list(vars, vars)

# Keep only lower triangle and blank out upper
lower_tri <- cor_matrix
lower_tri[upper.tri(lower_tri)] <- ""

# Convert to data frame, round values, and format
cor_df <- as.data.frame(lower_tri)
cor_df <- apply(cor_df, 2, function(x) ifelse(x == "", "", sprintf("%.3f", as.numeric(x))))
cor_df <- as.data.frame(cor_df)
cor_df <- rownames_to_column(cor_df, " ")

# Display clean table
kable(cor_df, format = "html", escape = FALSE,
      caption = "Table: Correlation Matrix of AI Competitiveness Indicators (Lower Triangle)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
Table: Correlation Matrix of AI Competitiveness Indicators (Lower Triangle)

          Talent  RnD     Policy  GDP_PC  GDP     Gini    AI_Score
Talent     1.000
RnD        0.773   1.000
Policy     0.322   0.422   1.000
GDP_PC     0.561   0.446   0.184   1.000
GDP        0.025  -0.102  -0.158   0.183   1.000
Gini      -0.294  -0.208  -0.224  -0.417  -0.019   1.000
AI_Score   0.862   0.941   0.532   0.547  -0.105  -0.257   1.000
# Save correlation heatmap
ggsave("AI_Correlation_Matrix.png", plot = corr_fig, width = 8, height = 6, dpi = 300)

  1. Post-Hoc Tukey HSD Test (per variable)

# Run ANOVA and Tukey HSD for each numeric variable
tukey_results <- list()

for (var in c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")) {
  aov_model <- aov(scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
  tukey_results[[var]] <- TukeyHSD(aov_model)
  print(paste("Tukey HSD for:", var))
  print(tukey_results[[var]])
}
## [1] "Tukey HSD for: Talent"
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
## 
## $`cleaned_data_scaled$Tier`
##                        diff        lwr        upr     p adj
## Laggard-Follower -0.5228938 -1.0723991 0.02661154 0.0653824
## Leader-Follower   1.1168768  0.5673714 1.66638209 0.0000243
## Leader-Laggard    1.6397706  1.0970079 2.18253323 0.0000000
## 
## [1] "Tukey HSD for: RnD"
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
## 
## $`cleaned_data_scaled$Tier`
##                        diff        lwr        upr     p adj
## Laggard-Follower -0.5115422 -1.0340809 0.01099655 0.0562536
## Leader-Follower   1.2021358  0.6795971 1.72467456 0.0000023
## Leader-Laggard    1.7136780  1.1975511 2.22980498 0.0000000
## 
## [1] "Tukey HSD for: Policy"
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
## 
## $`cleaned_data_scaled$Tier`
##                        diff        lwr        upr     p adj
## Laggard-Follower -0.9735183 -1.6174180 -0.3296186 0.0016712
## Leader-Follower   0.2575506 -0.3863491  0.9014503 0.6037905
## Leader-Laggard    1.2310688  0.5950701  1.8670676 0.0000556
## 
## [1] "Tukey HSD for: GDP_Per_Capita"
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
## 
## $`cleaned_data_scaled$Tier`
##                        diff        lwr        upr     p adj
## Laggard-Follower -0.8162374 -1.4041393 -0.2283356 0.0041734
## Leader-Follower   0.7550789  0.1526587  1.3574990 0.0105316
## Leader-Laggard    1.5713163  0.9834144  2.1592182 0.0000001
## 
## [1] "Tukey HSD for: GDP"
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
## 
## $`cleaned_data_scaled$Tier`
##                         diff        lwr       upr     p adj
## Laggard-Follower  0.20488252 -0.5666479 0.9764130 0.7991098
## Leader-Follower  -0.02182983 -0.8124133 0.7687536 0.9975666
## Leader-Laggard   -0.22671234 -0.9982428 0.5448181 0.7601101
## 
## [1] "Tukey HSD for: Gini_Index"
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
## 
## $`cleaned_data_scaled$Tier`
##                        diff        lwr        upr     p adj
## Laggard-Follower  1.1035916  0.3738370  1.8333462 0.0017832
## Leader-Follower   0.2226252 -0.5264961  0.9717465 0.7539105
## Leader-Laggard   -0.8809664 -1.5989750 -0.1629579 0.0127160
## 
## [1] "Tukey HSD for: AI_Score"
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
## 
## $`cleaned_data_scaled$Tier`
##                        diff       lwr        upr    p adj
## Laggard-Follower -0.8025245 -1.296882 -0.3081668 7.12e-04
## Leader-Follower   1.0302797  0.535922  1.5246374 1.55e-05
## Leader-Laggard    1.8328042  1.344512  2.3210959 0.00e+00
library(knitr)
library(kableExtra)

# Build a tidy data frame directly from the stored Tukey HSD results
tukey_data <- bind_rows(lapply(names(tukey_results), function(var) {
  res <- as.data.frame(tukey_results[[var]][[1]])  # matrix of diff, lwr, upr, p adj
  data.frame(
    Variable   = var,
    Comparison = rownames(res),
    Diff       = res$diff,
    Lwr        = res$lwr,
    Upr        = res$upr,
    P_Adj      = res$`p adj`
  )
}))

# Create the table
tukey_data %>%
  kable("html", caption = "Tukey HSD Test Results for Variables by Tier") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  column_spec(1, width = "1.5in") %>%
  column_spec(2, width = "2.5in") %>%
  column_spec(3, width = "1in") %>%
  column_spec(4, width = "1in") %>%
  column_spec(5, width = "1in") %>%
  column_spec(6, width = "1in")
Tukey HSD Test Results for Variables by Tier

Variable        Comparison        Diff        Lwr         Upr         P_Adj
Talent          Laggard-Follower  -0.5228938  -1.0723991   0.0266115  0.0653824
Talent          Leader-Follower    1.1168768   0.5673714   1.6663821  0.0000243
Talent          Leader-Laggard     1.6397706   1.0970079   2.1825332  0.0000000
RnD             Laggard-Follower  -0.5115422  -1.0340809   0.0109966  0.0562536
RnD             Leader-Follower    1.2021358   0.6795971   1.7246746  0.0000023
RnD             Leader-Laggard     1.7136780   1.1975511   2.2298050  0.0000000
Policy          Laggard-Follower  -0.9735183  -1.6174180  -0.3296186  0.0016712
Policy          Leader-Follower    0.2575506  -0.3863491   0.9014503  0.6037905
Policy          Leader-Laggard     1.2310688   0.5950701   1.8670676  0.0000556
GDP_Per_Capita  Laggard-Follower  -0.8162374  -1.4041393  -0.2283356  0.0041734
GDP_Per_Capita  Leader-Follower    0.7550789   0.1526587   1.3574990  0.0105316
GDP_Per_Capita  Leader-Laggard     1.5713163   0.9834144   2.1592182  0.0000001
GDP             Laggard-Follower   0.2048825  -0.5666479   0.9764130  0.7991098
GDP             Leader-Follower   -0.0218298  -0.8124133   0.7687536  0.9975666
GDP             Leader-Laggard    -0.2267123  -0.9982428   0.5448181  0.7601101
Gini_Index      Laggard-Follower   1.1035916   0.3738370   1.8333462  0.0017832
Gini_Index      Leader-Follower    0.2226252  -0.5264961   0.9717465  0.7539105
Gini_Index      Leader-Laggard    -0.8809664  -1.5989750  -0.1629579  0.0127160
AI_Score        Laggard-Follower  -0.8025245  -1.2968820  -0.3081668  0.0007120
AI_Score        Leader-Follower    1.0302797   0.5359220   1.5246374  0.0000155
AI_Score        Leader-Laggard     1.8328042   1.3445120   2.3210959  0.0000000

  2. Correlation Matrix with Significance (using psych + corrplot)

library(psych)
library(corrplot)

# Numeric data
numeric_data <- cleaned_data_scaled %>%
  select(Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index, AI_Score)

# Correlation and p-values
cor_result <- corr.test(numeric_data, use = "pairwise.complete.obs", method = "pearson")

# Plot significant correlation matrix
corrplot(cor_result$r, 
         method = "color", 
         type = "lower", 
         p.mat = cor_result$p, 
         sig.level = 0.05, 
         insig = "blank", 
         tl.cex = 0.8,
         addCoef.col = "black",
         col = colorRampPalette(c("red", "white", "blue"))(200),
         title = "Correlation Matrix (Significant at p < 0.05)",
         mar = c(0,0,2,0))

  3. Pairwise Plot (GGally)

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise scatterplots with Tier as color
ggpairs(cleaned_data_scaled %>%
           select(Tier, Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index, AI_Score),
         mapping = aes(color = Tier, alpha = 0.7),
         title = "Pairwise Scatterplots of Key AI Indicators")
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## (Further repeated ggpairs warnings about 3 or 10 rows removed per panel due to
## missing values, and `stat_bin()` bin-width messages, have been omitted.)

  4. Heatmaps of Standardized Variables by Region, Income Group, and Political Regime

library(reshape2)

library(pheatmap)
# Unload plyr if it is loaded
if("package:plyr" %in% search()) {
  detach("package:plyr", unload = TRUE)
}


# Ensure the numeric column names are correctly defined
numeric_cols <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")

# Step 1: Summarize the numeric indicators by grouping variables
summary_data <- cleaned_data_scaled %>%
  select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
  group_by(Region, `Income group`, `Political regime`) %>%
  summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop")

# Check the result
head(summary_data)
## # A tibble: 6 × 10
##   Region   `Income group` `Political regime` Talent    RnD Policy GDP_Per_Capita
##   <chr>    <chr>          <chr>               <dbl>  <dbl>  <dbl>          <dbl>
## 1 Africa   Lower middle   Closed autocracy   -0.884 -0.846 -1.60          -1.08 
## 2 Africa   Lower middle   Electoral autocra… -0.990 -0.680 -1.91          -1.13 
## 3 Africa   Lower middle   Electoral democra… -0.517 -0.778 -1.74          -1.05 
## 4 Africa   Upper middle   Electoral democra… -0.801 -0.652 -2.20          -0.972
## 5 Americas High           Liberal democracy   1.35   0.946  0.356         -0.143
## 6 Americas Upper middle   Electoral democra… -0.635 -0.692  0.292         -0.802
## # ℹ 3 more variables: GDP <dbl>, Gini_Index <dbl>, AI_Score <dbl>
# Load required packages
library(dplyr)
library(gt)

# Create the summary table
summary_table <- cleaned_data_scaled %>%
  select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
  group_by(Region, `Income group`, `Political regime`) %>%
  summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop")

# Generate a professional table with gt
summary_table %>%
  gt() %>%
  tab_header(
    title = "Summary of AI Indicators by Region, Income Group, and Political Regime",
    subtitle = "Mean values of standardized indicators"
  ) %>%
  fmt_number(columns = numeric_cols, decimals = 2) %>%
  cols_label(
    Region = "Region",
    `Income group` = "Income Group",
    `Political regime` = "Political Regime",
    Talent = "Talent",
    RnD = "R&D",
    Policy = "Policy",
    GDP_Per_Capita = "GDP per Capita",
    GDP = "GDP",
    Gini_Index = "Gini Index",
    AI_Score = "AI Score"
  ) %>%
  tab_options(
    table.font.size = px(12),
    heading.title.font.size = px(16),
    heading.subtitle.font.size = px(12),
    column_labels.font.weight = "bold"
  )
Summary of AI Indicators by Region, Income Group, and Political Regime
Mean values of standardized indicators

Region        Income Group  Political Regime     Talent  R&D    Policy  GDP per Capita  GDP    Gini Index  AI Score
Africa        Lower middle  Closed autocracy     −0.88   −0.85  −1.60   −1.08           −0.42   0.60       −0.99
Africa        Lower middle  Electoral autocracy  −0.99   −0.68  −1.91   −1.13            0.25   0.39       −1.46
Africa        Lower middle  Electoral democracy  −0.52   −0.78  −1.74   −1.05           −0.78  −0.31       −0.93
Africa        Upper middle  Electoral democracy  −0.80   −0.65  −2.20   −0.97            0.70   3.81       −0.94
Americas      High          Liberal democracy     1.35    0.95   0.36   −0.14           −0.46   0.55        1.21
Americas      Upper middle  Electoral democracy  −0.64   −0.69   0.29   −0.80            0.04   1.80       −0.53
Asia-Pacific  High          Electoral democracy   0.77    0.53  −0.05    0.98            0.73    NaN        0.66
Asia-Pacific  High          Liberal democracy     0.09    0.80   0.43    0.60           −0.71  −0.30        0.45
Asia-Pacific  Lower middle  Closed autocracy     −0.69   −0.82   0.42   −1.10            0.71   0.09       −0.81
Asia-Pacific  Lower middle  Electoral autocracy   0.65   −0.14  −0.82   −1.14           −0.19  −0.19       −0.58
Asia-Pacific  Lower middle  Electoral democracy  −0.72   −0.81  −0.38   −1.04           −0.81   0.50       −0.98
Asia-Pacific  Upper middle  Closed autocracy     −0.02    3.39   1.41   −0.83           −0.90   0.47        2.58
Asia-Pacific  Upper middle  Electoral autocracy  −0.42   −0.71  −0.39   −0.80            0.70   0.82       −0.48
Europe        High          Electoral autocracy  −0.42   −0.61  −0.11   −0.62           −0.23  −0.75       −0.46
Europe        High          Electoral democracy  −0.26   −0.42   0.41   −0.28            0.34  −0.52       −0.17
Europe        High          Liberal democracy     0.45    0.27   0.17    0.99            0.09  −0.53        0.36
Europe        Upper middle  Electoral autocracy  −0.29    0.06   1.24   −0.80           −0.96   0.33       −0.13
Europe        Upper middle  Electoral democracy  −0.66   −0.87  −1.66   −1.02           −0.89  −0.71       −1.02
Middle East   High          Closed autocracy     −0.91   −0.44  −0.07    0.35           −0.16  −1.24       −0.37
Middle East   High          Liberal democracy     1.25    0.82  −0.53    0.43            1.18   0.54        1.06
Middle East   Lower middle  Electoral autocracy  −1.03   −0.79   0.41   −1.09            0.99  −0.49       −1.26
# Load necessary packages
library(dplyr)
library(tibble)
library(pheatmap)

# Detach plyr if loaded (to prevent conflict with dplyr)
if ("package:plyr" %in% search()) {
  detach("package:plyr", unload = TRUE)
}

# Define numeric columns
numeric_cols <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")

# Summarize the data by Region
region_summary <- cleaned_data_scaled %>%
  group_by(Region) %>%
  summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
  mutate(Group = paste("Region", Region, sep = "_")) %>%
  select(Group, all_of(numeric_cols)) %>%
  column_to_rownames("Group") %>%
  as.matrix()

# Plot heatmap for Region
pheatmap(region_summary,
         scale = "none",
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean",
         clustering_method = "ward.D2",
         fontsize = 10,
         main = "Heatmap: Standardized AI Metrics by Region")

# Summarize the data by Income group
income_summary <- cleaned_data_scaled %>%
  group_by(`Income group`) %>%
  summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
  mutate(Group = paste("Income", `Income group`, sep = "_")) %>%
  select(Group, all_of(numeric_cols)) %>%
  column_to_rownames("Group") %>%
  as.matrix()

# Plot heatmap for Income group
pheatmap(income_summary,
         scale = "none",
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean",
         clustering_method = "ward.D2",
         fontsize = 10,
         main = "Heatmap: Standardized AI Metrics by Income Group")

# Summarize the data by Political regime
regime_summary <- cleaned_data_scaled %>%
  group_by(`Political regime`) %>%
  summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
  mutate(Group = paste("Regime", `Political regime`, sep = "_")) %>%
  select(Group, all_of(numeric_cols)) %>%
  column_to_rownames("Group") %>%
  as.matrix()

# Plot heatmap for Political regime
pheatmap(regime_summary,
         scale = "none",
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean",
         clustering_method = "ward.D2",
         fontsize = 10,
         main = "Heatmap: Standardized AI Metrics by Political Regime")

# Build a numeric matrix from summary_data for heatmap plotting, using the row number as a unique row identifier
region_data <- summary_data %>%
  mutate(UniqueID = row_number()) %>%
  select(UniqueID, `Talent`, `RnD`, `Policy`, `GDP_Per_Capita`, `GDP`, `Gini_Index`, `AI_Score`) %>%
  column_to_rownames("UniqueID") %>%  # Set unique ID as row names
  as.matrix()

# Check the result
head(region_data)
##       Talent        RnD     Policy GDP_Per_Capita         GDP Gini_Index
## 1 -0.8835424 -0.8455610 -1.5985421     -1.0758032 -0.41742383  0.6045741
## 2 -0.9896879 -0.6801203 -1.9089894     -1.1301460  0.24587367  0.3928943
## 3 -0.5167981 -0.7780284 -1.7402432     -1.0474504 -0.77746434 -0.3104286
## 4 -0.8013864 -0.6522878 -2.2041999     -0.9718430  0.70055341  3.8139116
## 5  1.3458420  0.9456066  0.3556565     -0.1433123 -0.45623962  0.5544993
## 6 -0.6352671 -0.6918467  0.2923291     -0.8017264  0.03849034  1.7995401
##     AI_Score
## 1 -0.9947824
## 2 -1.4596193
## 3 -0.9286605
## 4 -0.9392400
## 5  1.2088947
## 6 -0.5322598
# Load necessary packages
library(dplyr)
library(ggplot2)
library(tidyr)

# Detach plyr if loaded (to prevent conflict with dplyr)
if ("package:plyr" %in% search()) {
  detach("package:plyr", unload = TRUE)
}

# Define numeric columns
numeric_cols <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")

# Pivot the data to long format for plotting
long_data <- cleaned_data_scaled %>%
  select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
  pivot_longer(cols = all_of(numeric_cols), names_to = "Indicator", values_to = "Value")

# --- BOXPLOT BY REGION ---
ggplot(long_data, aes(x = Region, y = Value, fill = Region)) +
  geom_boxplot() +
  facet_wrap(~ Indicator, scales = "free_y") +
  theme_minimal() +
  labs(title = "Boxplot of AI Indicators by Region", y = "Value", x = "Region") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# --- BOXPLOT BY INCOME GROUP ---
ggplot(long_data, aes(x = `Income group`, y = Value, fill = `Income group`)) +
  geom_boxplot() +
  facet_wrap(~ Indicator, scales = "free_y") +
  theme_minimal() +
  labs(title = "Boxplot of AI Indicators by Income Group", y = "Value", x = "Income Group") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# --- BOXPLOT BY POLITICAL REGIME ---
ggplot(long_data, aes(x = `Political regime`, y = Value, fill = `Political regime`)) +
  geom_boxplot() +
  facet_wrap(~ Indicator, scales = "free_y") +
  theme_minimal() +
  labs(title = "Boxplot of AI Indicators by Political Regime", y = "Value", x = "Political Regime") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

1.2 Building the models

# Load the pls library for PLSR (if not already loaded)
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
# 7. Prepare modeling data and remove NAs
model_data <- cleaned_data_scaled %>%
  select(AI_Score, Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index) %>%
  na.omit()

# Split into training and testing sets
set.seed(131017)
train_index <- createDataPartition(model_data$AI_Score, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data  <- model_data[-train_index, ]

# -------------------------------
# 8. Partial Least Squares Regression (PLSR)
set.seed(131017)
plsr_model <- plsr(
  AI_Score ~ ., 
  data = train_data, 
  scale = TRUE, 
  validation = "CV"
)

# Predict using 2 components (adjustable)
plsr_pred <- predict(plsr_model, newdata = test_data, ncomp = 2)
plsr_rmse <- sqrt(mean((plsr_pred - test_data$AI_Score)^2))
print(paste("PLSR RMSE (2 components):", round(plsr_rmse, 3)))
## [1] "PLSR RMSE (2 components): 0.315"
# Optional: Full component RMSE
plsr_pred_all <- predict(plsr_model, newdata = test_data, ncomp = plsr_model$ncomp)
plsr_rmse_all <- sqrt(mean((plsr_pred_all - test_data$AI_Score)^2))
print(paste("PLSR RMSE (all components):", round(plsr_rmse_all, 3)))
## [1] "PLSR RMSE (all components): 0.299"
# -------------------------------
# 9. Random Forest
set.seed(131017)
rf_model <- randomForest(
  AI_Score ~ ., 
  data = train_data, 
  importance = TRUE
)
rf_pred <- predict(rf_model, newdata = test_data)
rf_rmse <- sqrt(mean((rf_pred - test_data$AI_Score)^2))
print(paste("Random Forest RMSE:", round(rf_rmse, 3)))
## [1] "Random Forest RMSE: 0.295"
# -------------------------------
# 10. Neural Network (MLP)
set.seed(131017)
nn_model <- nnet(
  AI_Score ~ ., 
  data = train_data, 
  size = 5,       # Number of hidden units
  linout = TRUE,  # Regression output
  decay = 0.01,   # Weight decay (regularization)
  maxit = 500     # Maximum iterations
)
## # weights:  41
## initial  value 53.352396 
## iter  10 value 2.305113
## iter  20 value 0.940506
## iter  30 value 0.679601
## iter  40 value 0.638504
## iter  50 value 0.629446
## iter  60 value 0.626957
## iter  70 value 0.626213
## iter  80 value 0.626007
## iter  90 value 0.625953
## iter 100 value 0.625911
## iter 110 value 0.625904
## iter 120 value 0.625903
## final  value 0.625903 
## converged
nn_pred <- predict(nn_model, newdata = test_data)
nn_rmse <- sqrt(mean((nn_pred - test_data$AI_Score)^2))
print(paste("Neural Network RMSE:", round(nn_rmse, 3)))
## [1] "Neural Network RMSE: 0.226"
plsr_model
## Partial least squares regression, fitted with the kernel algorithm.
## Cross-validated using 10 random segments.
## Call:
## plsr(formula = AI_Score ~ ., data = train_data, scale = TRUE,     validation = "CV")
print(paste("PLSR RMSE (2 components):", round(plsr_rmse, 3)))
## [1] "PLSR RMSE (2 components): 0.315"
# Model Performance: The random forest model explains 71.17% of the variance, with a mean squared residual of 0.1762446.
rf_model
## 
## Call:
##  randomForest(formula = AI_Score ~ ., data = train_data, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 0.1762446
##                     % Var explained: 71.17
print(paste("Random Forest RMSE:", round(rf_rmse, 3)))
## [1] "Random Forest RMSE: 0.295"
# Extract variable importance from the trained model
rf_varimp <- randomForest::importance(rf_model)

# Convert to data frame
rf_varimp_df <- as.data.frame(rf_varimp) %>%
  rownames_to_column(var = "Variable")

# Check the result
print(head(rf_varimp_df))
##         Variable   %IncMSE IncNodePurity
## 1         Talent 13.223601      6.236898
## 2            RnD 15.731153      8.084483
## 3         Policy  6.488547      3.902219
## 4 GDP_Per_Capita 10.131607      4.465569
## 5            GDP  1.353361      1.229903
## 6     Gini_Index  4.596936      1.585927
rf_varimp <- varImp(rf_model, scale = TRUE)
print(rf_varimp)
##                  Overall
## Talent         13.223601
## RnD            15.731153
## Policy          6.488547
## GDP_Per_Capita 10.131607
## GDP             1.353361
## Gini_Index      4.596936
library(ggplot2)

# Create the data frame
importance_df <- data.frame(
  Variable = c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index"),
  Importance = c(13.223601, 15.731153, 6.488547, 10.131607, 1.353361, 4.596936)
)

# Plot
ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "#2E86C1", width = 0.7) +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text = element_text(color = "black"),
    panel.grid.major.y = element_blank()
  )

nn_model
## a 6-5-1 network with 41 weights
## inputs: Talent RnD Policy GDP_Per_Capita GDP Gini_Index 
## output(s): AI_Score 
## options were - linear output units  decay=0.01
print(paste("Neural Network RMSE:", round(nn_rmse, 3)))
## [1] "Neural Network RMSE: 0.226"
# Load required packages
library(dplyr)
library(gt)

# Store model RMSEs
model_performance <- tibble(
  Model = c("PLSR (2 Components)", "PLSR (All Components)", "Random Forest", "Neural Network"),
  RMSE = c(
    round(plsr_rmse, 3),
    round(plsr_rmse_all, 3),
    round(rf_rmse, 3),
    round(nn_rmse, 3)
  )
)

# Create professional publication-style table
performance_table <- model_performance %>%
  gt() %>%
  tab_header(
    title = md("**Model Comparison: Predictive Accuracy on AI Competitiveness**"),
    subtitle = "Root Mean Square Error (RMSE) on Test Set"
  ) %>%
  cols_label(
    Model = "Model Type",
    RMSE = "Test RMSE"
  ) %>%
  fmt_number(columns = RMSE, decimals = 3) %>%
  tab_options(
    table.font.names = "Times New Roman",
    table.border.top.width = px(2),
    heading.align = "center",
    column_labels.font.weight = "bold"
  )

# View the table
performance_table
Model Comparison: Predictive Accuracy on AI Competitiveness
Root Mean Square Error (RMSE) on Test Set

Model Type              Test RMSE
PLSR (2 Components)     0.315
PLSR (All Components)   0.299
Random Forest           0.295
Neural Network          0.226

1.3 Model Training & Evaluation

1.3.1 Partial Least Squares Regression (PLSR)

set.seed(131017)
plsr_model <- plsr(AI_Score ~ ., data = train_data, scale = TRUE, validation = "CV")
plsr_pred <- predict(plsr_model, newdata = test_data, ncomp = 2)
plsr_rmse <- sqrt(mean((plsr_pred - test_data$AI_Score)^2))
plsr_pred_all <- predict(plsr_model, newdata = test_data, ncomp = plsr_model$ncomp)
plsr_rmse_all <- sqrt(mean((plsr_pred_all - test_data$AI_Score)^2))
plsr_rmse_all
## [1] 0.2986293

1.3.2 Random Forest

set.seed(131017)
rf_model <- randomForest(AI_Score ~ ., data = train_data, importance = TRUE)
rf_pred <- predict(rf_model, newdata = test_data)
rf_rmse <- sqrt(mean((rf_pred - test_data$AI_Score)^2))
rf_rmse
## [1] 0.2953213

1.3.3 Neural Network (MLP)

set.seed(131017)
nn_model <- nnet(
  AI_Score ~ ., 
  data = train_data, 
  size = 5, 
  linout = TRUE, 
  decay = 0.01, 
  maxit = 500
)
## # weights:  41
## initial  value 53.352396 
## iter  10 value 2.305113
## iter  20 value 0.940506
## iter  30 value 0.679601
## iter  40 value 0.638504
## iter  50 value 0.629446
## iter  60 value 0.626957
## iter  70 value 0.626213
## iter  80 value 0.626007
## iter  90 value 0.625953
## iter 100 value 0.625911
## iter 110 value 0.625904
## iter 120 value 0.625903
## final  value 0.625903 
## converged
nn_pred <- predict(nn_model, newdata = test_data)
nn_rmse <- sqrt(mean((nn_pred - test_data$AI_Score)^2))
nn_rmse
## [1] 0.2260537

1.4 Model Performance Summary

model_performance <- tibble(
  Model = c("PLSR (2 Components)", "PLSR (All Components)", "Random Forest", "Neural Network"),
  RMSE = c(round(plsr_rmse, 3), round(plsr_rmse_all, 3), round(rf_rmse, 3), round(nn_rmse, 3))
)

model_performance %>%
  gt() %>%
  tab_header(
    title = md("**Model Comparison: Predictive Accuracy on AI Competitiveness**"),
    subtitle = "Root Mean Square Error (RMSE) on Test Set"
  ) %>%
  cols_label(Model = "Model Type", RMSE = "Test RMSE") %>%
  fmt_number(columns = RMSE, decimals = 3) %>%
  tab_options(
    table.font.names = "Times New Roman",
    table.border.top.width = px(2),
    heading.align = "center",
    column_labels.font.weight = "bold"
  )
Model Comparison: Predictive Accuracy on AI Competitiveness
Root Mean Square Error (RMSE) on Test Set
Model Type Test RMSE
PLSR (2 Components) 0.315
PLSR (All Components) 0.299
Random Forest 0.295
Neural Network 0.226
To put the test-set errors in context, the summary below adds cross-validated RMSE and its variability, R², and directional accuracy (because the scores are standardized, this is the share of countries predicted on the correct side of the average).

# Load libraries (caret last so its train() is not masked)
library(dplyr)
library(gt)
library(pls)
library(randomForest)
library(nnet)
library(doParallel)
library(caret)

# Create parallel backend so caret's cross-validation runs in parallel
cl <- makePSOCKcluster(2)  # Adjust number of cores as needed
registerDoParallel(cl)

# --- PLSR Performance ---
plsr_cv <- RMSEP(plsr_model, estimate = "CV")
# val is indexed estimate x response x (intercept + components), so the
# 2-component CV RMSE is safest to extract by name
plsr_cv_rmse <- plsr_cv$val[1, 1, "2 comps"]
plsr_cv_sd <- sd(plsr_cv$val[1, 1, ])   # Rough spread across component counts (not a fold-level SD)
# Flatten the n x 1 x 1 prediction array to a plain vector
plsr_pred_vec <- plsr_pred[, 1, 1]
plsr_rmse <- sqrt(mean((plsr_pred_vec - test_data$AI_Score)^2))
plsr_r2 <- cor(plsr_pred_vec, test_data$AI_Score)^2
plsr_accuracy <- mean(sign(plsr_pred_vec) == sign(test_data$AI_Score))

# --- Random Forest Performance ---
rf_rmse <- sqrt(mean((rf_pred - test_data$AI_Score)^2))
# OOB-based estimate; min() over trees is slightly optimistic compared with
# the conventional final-tree OOB error, rf_model$mse[rf_model$ntree]
rf_cv_rmse <- sqrt(min(rf_model$mse))
rf_cv_sd <- sd(rf_model$mse)   # Spread of OOB MSE across trees (a rough variability measure)
rf_r2 <- cor(rf_pred, test_data$AI_Score)^2
rf_accuracy <- mean(sign(rf_pred) == sign(test_data$AI_Score))

# --- Neural Network Performance (caret for CV) ---
set.seed(131017)
nn_ctrl <- trainControl(method = "cv", number = 5)
nn_cv <- caret::train(
  AI_Score ~ ., 
  data = train_data, 
  method = "nnet", 
  linout = TRUE,
  trace = FALSE,
  tuneGrid = expand.grid(size = 5, decay = 0.01),
  maxit = 500,
  trControl = nn_ctrl
)
nn_cv_rmse <- nn_cv$results$RMSE[1]
nn_cv_sd <- nn_cv$results$RMSESD[1]
# Test-set predictions come from the base nnet model fitted earlier, not the caret refit
nn_pred <- predict(nn_model, newdata = test_data)
nn_rmse <- sqrt(mean((nn_pred - test_data$AI_Score)^2))
nn_r2 <- cor(nn_pred, test_data$AI_Score)^2
nn_accuracy <- mean(sign(nn_pred) == sign(test_data$AI_Score))

# Stop parallel backend
stopCluster(cl)

# --- Compile Results ---
model_performance <- data.frame(
  Model = c("Partial Least Squares Regression (PLSR)", 
            "Random Forest", 
            "Neural Network (MLP)"),
  Test_RMSE = c(plsr_rmse, rf_rmse, nn_rmse),
  CV_RMSE = c(plsr_cv_rmse, rf_cv_rmse, nn_cv_rmse),
  CV_RMSE_SD = c(plsr_cv_sd, rf_cv_sd, nn_cv_sd),
  R_Squared = c(plsr_r2, rf_r2, nn_r2),
  Accuracy = c(plsr_accuracy, rf_accuracy, nn_accuracy)
)

# Round for display
model_performance <- model_performance %>%
  mutate(across(where(is.numeric), ~ round(., 3)))

# Identify best models
best_cv_rmse <- min(model_performance$CV_RMSE)
best_r2 <- max(model_performance$R_Squared)

# --- Create GT Table ---
model_performance %>%
  gt() %>%
  tab_header(
    title = md("**Predicting AI_Score: Model Performance with Cross-Validated RMSE and Variability (SD)**")
  ) %>%
  cols_label(
    Model = "Model",
    Test_RMSE = "Test RMSE",
    CV_RMSE = "CV RMSE",
    CV_RMSE_SD = "CV RMSE SD",
    R_Squared = "R²",
    Accuracy = "Directional Accuracy"
  ) %>%
  fmt_number(columns = c(Test_RMSE, CV_RMSE, CV_RMSE_SD, R_Squared, Accuracy), decimals = 3) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = "Model")
  ) %>%
  # Highlight best CV RMSE
  tab_style(
    style = list(cell_fill(color = "#d0f0c0"), cell_text(weight = "bold")),
    locations = cells_body(rows = CV_RMSE == best_cv_rmse)
  ) %>%
  # Highlight best R²
  tab_style(
    style = list(cell_fill(color = "#cce5ff"), cell_text(weight = "bold")),
    locations = cells_body(rows = R_Squared == best_r2)
  ) %>%
  tab_options(
    table.font.size = 12,
    heading.title.font.size = 14,
    heading.title.font.weight = "bold",
    table.width = pct(90)
  )
Predicting AI_Score: Model Performance with Cross-Validated RMSE and Variability (SD)

Model                                      Test RMSE   CV RMSE   CV RMSE SD      R²   Directional Accuracy
Partial Least Squares Regression (PLSR)        0.315     0.319        0.194   0.869                  0.875
Random Forest                                  0.295     0.416        0.015   0.896                  0.875
Neural Network (MLP)                           0.226     0.357        0.165   0.927                  0.875
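
The table reports fold-to-fold variability rather than formal confidence intervals. For the caret-tuned network, the per-fold RMSEs are retained in the train object, so an approximate t-based 95% interval can be sketched (an illustration, not part of the original output; the PLSR and random forest variability measures above have no comparable fold-level values):

# Approximate 95% interval for the NN cross-validated RMSE from its five folds
fold_rmse <- nn_cv$resample$RMSE
ci <- mean(fold_rmse) + qt(c(0.025, 0.975), df = length(fold_rmse) - 1) *
  sd(fold_rmse) / sqrt(length(fold_rmse))
round(ci, 3)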

1.5 Conclusion

Among the models evaluated, the Neural Network achieves the lowest test-set RMSE (0.226), but PLSR has the lowest cross-validated RMSE (0.319); with a single 80/20 split of roughly 60 countries, the test-set ranking should be read with caution. Random Forest and PLSR perform almost comparably on the test set, trading off flexibility against interpretability. Further work may incorporate additional covariates, longitudinal data, or ensemble approaches; a minimal ensemble sketch follows.
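
As one illustration of the ensemble direction, here is a sketch that simply averages the three sets of test predictions already computed above (an equal-weight blend chosen for simplicity, not a tuned ensemble):

# Hypothetical equal-weight ensemble; assumes plsr_pred_vec, rf_pred and
# nn_pred exist as computed in the performance section
ens_pred <- (plsr_pred_vec + rf_pred + as.numeric(nn_pred)) / 3
sqrt(mean((ens_pred - test_data$AI_Score)^2))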
