This report presents a comparative analysis of three regression models used to predict national-level AI competitiveness scores. The models include:
The analysis evaluates the models based on their Root Mean Square Error (RMSE) on a standardized test dataset.
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(readr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Load dataset
Ai_index <- readr::read_csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/AI_index_db.csv")
## Rows: 62 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Country, Region, Cluster, Income group, Political regime
## dbl (8): Talent, Infrastructure, Operating Environment, Research, Developmen...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gdp_raw <- readr::read_csv(
"https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/gdp_gini_data.csv",
col_names = "raw"
)
## Rows: 210 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): raw
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gdp_data <- gdp_raw %>%
tidyr::separate(raw, into = c("Country", "GDP_Per_Capita", "Gini_Index", "GDP"), sep = ",", fill = "right") %>%
mutate(across(everything(), trimws)) %>%
mutate(
GDP_Per_Capita = as.numeric(gsub("[^0-9.]", "", GDP_Per_Capita)),
Gini_Index = as.numeric(gsub("[^0-9.]", "", Gini_Index)),
GDP = as.numeric(gsub("[^0-9.]", "", GDP))
)
Ai_index <- Ai_index %>% mutate(Country = trimws(Country))
merged_data <- Ai_index %>%
left_join(gdp_data, by = "Country")
cleaned_data <- merged_data %>%
mutate(
RnD = (Research + Development) / 2,
Policy = `Government Strategy`,
AI_Score = `Total score`
) %>%
select(Country, Region, `Income group`, `Political regime`,
Talent, RnD, Policy, AI_Score,
GDP_Per_Capita, GDP, Gini_Index) %>%
filter(!is.na(AI_Score))
numeric_vars <- cleaned_data %>%
select(where(is.numeric)) %>%
scale(center = TRUE, scale = TRUE) %>%
as.data.frame()
cleaned_data_scaled <- bind_cols(
cleaned_data %>% select(-where(is.numeric)),
numeric_vars
)
model_data <- cleaned_data_scaled %>%
select(AI_Score, Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index) %>%
na.omit()
set.seed(131017)
train_index <- createDataPartition(model_data$AI_Score, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# 1. Load necessary libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(caret)
library(corrplot)
## corrplot 0.95 loaded
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(nnet)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggcorrplot)
library(ggthemes)
library(tensorflow)
##
## Attaching package: 'tensorflow'
## The following object is masked from 'package:caret':
##
## train
# 2. Load AI Index dataset
Ai_index <- readr::read_csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/AI_index_db.csv")
## Rows: 62 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Country, Region, Cluster, Income group, Political regime
## dbl (8): Talent, Infrastructure, Operating Environment, Research, Developmen...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 3. Load GDP and Gini data (1-column CSV, needs parsing)
gdp_raw <- readr::read_csv(
"https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/gdp_gini_data.csv",
col_names = "raw"
)
## Rows: 210 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): raw
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 4. Clean and separate GDP data
gdp_data <- gdp_raw %>%
separate(raw, into = c("Country", "GDP_Per_Capita", "Gini_Index", "GDP"), sep = ",", fill = "right") %>%
mutate(across(everything(), trimws)) %>%
mutate(
GDP_Per_Capita = as.numeric(gsub("[^0-9.]", "", GDP_Per_Capita)),
Gini_Index = as.numeric(gsub("[^0-9.]", "", Gini_Index)),
GDP = as.numeric(gsub("[^0-9.]", "", GDP))
)
# 5. Clean AI Index country names
Ai_index <- Ai_index %>%
mutate(Country = trimws(Country))
# 6. Merge datasets
merged_data <- Ai_index %>%
left_join(gdp_data, by = "Country")
# 7. Clean and rename relevant variables
cleaned_data <- merged_data %>%
mutate(
RnD = (Research + Development) / 2, # Average of Research and Development
Policy = `Government Strategy`, # Rename for clarity
AI_Score = `Total score` # Rename for modeling
) %>%
select(
Country, Region, `Income group`, `Political regime`,
Talent, RnD, Policy, AI_Score,
GDP_Per_Capita, GDP, Gini_Index
)
# 8. Filter out rows with missing AI_Score (essential for modeling)
cleaned_data <- cleaned_data %>%
filter(!is.na(AI_Score))
# 9. Standardize numeric variables using Z-scores
numeric_vars <- cleaned_data %>%
select(where(is.numeric)) %>%
scale(center = TRUE, scale = TRUE) %>%
as.data.frame()
# 10. Combine standardized numerics with categorical columns
cleaned_data_scaled <- bind_cols(
cleaned_data %>% select(-where(is.numeric)),
numeric_vars
)
# 11. Create AI Competitiveness Tiers
cleaned_data_scaled <- cleaned_data_scaled %>%
mutate(Tier = case_when(
AI_Score >= quantile(AI_Score, 0.67, na.rm = TRUE) ~ "Leader",
AI_Score >= quantile(AI_Score, 0.33, na.rm = TRUE) ~ "Follower",
TRUE ~ "Laggard"
))
# 12. Quick data checks
glimpse(cleaned_data_scaled)
## Rows: 62
## Columns: 12
## $ Country <chr> "United States of America", "China", "United Kingdo…
## $ Region <chr> "Americas", "Asia-Pacific", "Europe", "Americas", "…
## $ `Income group` <chr> "High", "Upper middle", "High", "High", "High", "Hi…
## $ `Political regime` <chr> "Liberal democracy", "Closed autocracy", "Liberal d…
## $ Talent <dbl> 5.46809990, -0.01926160, 1.50160970, 0.95149333, 1.…
## $ RnD <dbl> 4.76303426, 3.38949496, 0.85038281, 0.70684075, 0.8…
## $ Policy <dbl> 0.74371558, 1.40955823, 0.95055343, 1.60496858, -0.…
## $ AI_Score <dbl> 5.0309049, 2.5791054, 1.1250852, 1.0761550, 1.05631…
## $ GDP_Per_Capita <dbl> NA, -0.83007921, 0.64426449, 0.65844087, 0.43161876…
## $ GDP <dbl> NA, -0.8959691, -0.9568674, -0.9609821, 1.1782757, …
## $ Gini_Index <dbl> NA, 0.468006500, 0.003676819, -0.242144777, 0.53629…
## $ Tier <chr> "Leader", "Leader", "Leader", "Leader", "Leader", "…
table(cleaned_data_scaled$Tier)
##
## Follower Laggard Leader
## 20 21 21
summary(cleaned_data_scaled)
## Country Region Income group Political regime
## Length:62 Length:62 Length:62 Length:62
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Talent RnD Policy AI_Score
## Min. :-1.1044 Min. :-0.8710 Min. :-2.2042 Min. :-1.58128
## 1st Qu.:-0.6203 1st Qu.:-0.6926 1st Qu.:-0.6413 1st Qu.:-0.60235
## Median :-0.2207 Median :-0.3252 Median : 0.2310 Median :-0.04593
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.5103 3rd Qu.: 0.4011 3rd Qu.: 0.7651 3rd Qu.: 0.43461
## Max. : 5.4681 Max. : 4.7630 Max. : 1.6050 Max. : 5.03090
##
## GDP_Per_Capita GDP Gini_Index Tier
## Min. :-1.1467 Min. :-0.9651 Min. :-1.4303 Length:62
## 1st Qu.:-0.8088 1st Qu.:-0.9244 1st Qu.:-0.6553 Class :character
## Median :-0.3245 Median :-0.4174 Median :-0.2285 Mode :character
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6868 3rd Qu.: 0.7073 3rd Qu.: 0.4851
## Max. : 3.0259 Max. : 2.7583 Max. : 3.8139
## NA's :3 NA's :3 NA's :10
# 13. (Optional) Save the cleaned dataset
# readr::write_csv(cleaned_data_scaled, "cleaned_ai_gdp_data.csv")
table(cleaned_data_scaled$Tier)
##
## Follower Laggard Leader
## 20 21 21
# Generate summary stats per Tier
summary_stats <- cleaned_data_scaled %>%
group_by(Tier) %>%
summarise(
Talent_Mean = mean(Talent, na.rm = TRUE),
Talent_SD = sd(Talent, na.rm = TRUE),
RnD_Mean = mean(RnD, na.rm = TRUE),
RnD_SD = sd(RnD, na.rm = TRUE),
Policy_Mean = mean(Policy, na.rm = TRUE),
Policy_SD = sd(Policy, na.rm = TRUE),
GDP_Per_Capita_Mean = mean(GDP_Per_Capita, na.rm = TRUE),
GDP_Per_Capita_SD = sd(GDP_Per_Capita, na.rm = TRUE),
GDP_Mean = mean(GDP, na.rm = TRUE),
GDP_SD = sd(GDP, na.rm = TRUE),
Gini_Index_Mean = mean(Gini_Index, na.rm = TRUE),
Gini_Index_SD = sd(Gini_Index, na.rm = TRUE),
.groups = "drop"
)
# Create journal-style table
summary_stats %>%
kable(format = "html", digits = 2, caption = "Descriptive Statistics by AI Competitiveness Tier") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Tier | Talent_Mean | Talent_SD | RnD_Mean | RnD_SD | Policy_Mean | Policy_SD | GDP_Per_Capita_Mean | GDP_Per_Capita_SD | GDP_Mean | GDP_SD | Gini_Index_Mean | Gini_Index_SD |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Follower | -0.20 | 0.38 | -0.23 | 0.34 | 0.24 | 0.72 | 0.05 | 0.76 | -0.07 | 0.96 | -0.48 | 0.97 |
| Laggard | -0.72 | 0.21 | -0.75 | 0.12 | -0.73 | 0.97 | -0.77 | 0.57 | 0.14 | 0.98 | 0.63 | 1.10 |
| Leader | 0.92 | 1.18 | 0.97 | 1.14 | 0.50 | 0.86 | 0.80 | 0.95 | -0.09 | 1.09 | -0.25 | 0.43 |
# Run ANOVA and tidy results
vars <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index")
anova_results <- lapply(vars, function(var) {
formula <- as.formula(paste(var, "~ Tier"))
aov_model <- aov(formula, data = cleaned_data_scaled)
tidy_res <- summary(aov_model)[[1]]
data.frame(Variable = var,
F = tidy_res$`F value`[1],
p_value = tidy_res$`Pr(>F)`[1])
})
anova_table <- bind_rows(anova_results)
# Display in table format
anova_table %>%
mutate(Significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
)) %>%
kable(format = "html", digits = 3, caption = "ANOVA Results by AI Tier") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
| Variable | F | p_value | Significance |
|---|---|---|---|
| Talent | 27.497 | 0.000 | *** |
| RnD | 33.531 | 0.000 | *** |
| Policy | 12.010 | 0.000 | *** |
| GDP_Per_Capita | 20.756 | 0.000 | *** |
| GDP | 0.310 | 0.735 | |
| Gini_Index | 7.704 | 0.001 | ** |
anova_table
## Variable F p_value
## 1 Talent 27.4965605 3.649340e-09
## 2 RnD 33.5311907 1.874398e-10
## 3 Policy 12.0103149 4.208687e-05
## 4 GDP_Per_Capita 20.7563143 1.801194e-07
## 5 GDP 0.3096422 7.349575e-01
## 6 Gini_Index 7.7037319 1.232728e-03
# Load libraries
library(gt)
library(dplyr)
# Create the data frame
ai_significance_table <- data.frame(
Variable = c("RnD", "Talent", "GDP/Capita", "Policy", "Gini Index", "GDP (total)"),
`F-Statistic` = c(33.53, 27.50, 20.76, 12.01, 7.70, 0.31),
`p-value` = c(1.87e-10, 3.65e-09, 1.80e-07, 4.21e-05, 0.00123, 0.73496),
check.names = FALSE # preserve column names with hyphens
)
# Generate the table
ai_significance_table %>%
gt() %>%
tab_header(
title = md("**Statistical Significance of Selected Predictors with Respect to AI_Score**")
) %>%
fmt_number(
columns = all_of("F-Statistic"),
decimals = 2
) %>%
fmt_scientific(
columns = all_of("p-value"),
decimals = 2
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = "Variable")
) %>%
cols_label(
Variable = "Variable",
`F-Statistic` = "F-Statistic",
`p-value` = "p-value"
) %>%
tab_options(
table.font.size = 12,
table.width = pct(90),
heading.title.font.size = 14,
heading.title.font.weight = "bold"
)
| Statistical Significance of Selected Predictors with Respect to AI_Score | ||
| Variable | F-Statistic | p-value |
|---|---|---|
| RnD | 33.53 | 1.87 × 10−10 |
| Talent | 27.50 | 3.65 × 10−9 |
| GDP/Capita | 20.76 | 1.80 × 10−7 |
| Policy | 12.01 | 4.21 × 10−5 |
| Gini Index | 7.70 | 1.23 × 10−3 |
| GDP (total) | 0.31 | 7.35 × 10−1 |
# Load libraries
library(gt)
library(dplyr)
# Create the data frame with significance stars
ai_significance_table <- data.frame(
Variable = c("RnD", "Talent", "GDP/Capita", "Policy", "Gini Index", "GDP (total)"),
`F-Statistic` = c(33.53, 27.50, 20.76, 12.01, 7.70, 0.31),
`p-value` = c(1.87e-10, 3.65e-09, 1.80e-07, 4.21e-05, 0.00123, 0.73496),
check.names = FALSE
) %>%
mutate(
Significance = case_when(
`p-value` < 0.001 ~ "***",
`p-value` < 0.01 ~ "**",
`p-value` < 0.05 ~ "*",
TRUE ~ ""
),
`p-value` = paste0(formatC(`p-value`, format = "e", digits = 2), " ", Significance)
) %>%
select(-Significance) # Remove intermediate column
# Generate the styled table
ai_significance_table %>%
gt() %>%
tab_header(
title = md("**Statistical Significance of Selected Predictors with Respect to AI_Score**")
) %>%
fmt_number(
columns = all_of("F-Statistic"),
decimals = 2
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = "Variable")
) %>%
cols_label(
Variable = "Variable",
`F-Statistic` = "F-Statistic",
`p-value` = "p-value"
) %>%
tab_options(
table.font.size = 12,
table.width = pct(90),
heading.title.font.size = 14,
heading.title.font.weight = "bold"
)
| Statistical Significance of Selected Predictors with Respect to AI_Score | ||
| Variable | F-Statistic | p-value |
|---|---|---|
| RnD | 33.53 | 1.87e-10 *** |
| Talent | 27.50 | 3.65e-09 *** |
| GDP/Capita | 20.76 | 1.80e-07 *** |
| Policy | 12.01 | 4.21e-05 *** |
| Gini Index | 7.70 | 1.23e-03 ** |
| GDP (total) | 0.31 | 7.35e-01 |
# Melt data for boxplot
plot_data <- cleaned_data_scaled %>%
select(Tier, Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index) %>%
pivot_longer(-Tier, names_to = "Variable", values_to = "Value")
# Boxplot
ggplot(plot_data, aes(x = Tier, y = Value, fill = Tier)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
facet_wrap(~Variable, scales = "free", ncol = 3) +
labs(title = "Distribution of AI Determinants by Tier", x = "AI Competitiveness Tier", y = "Standardized Value") +
theme_minimal() +
theme(legend.position = "none")
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Load necessary packages
library(dplyr)
library(knitr)
library(kableExtra)
# Define numeric columns
numeric_cols <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")
# Summarize the data
summary_data <- cleaned_data_scaled %>%
select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
group_by(Region, `Income group`, `Political regime`) %>%
summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
mutate(across(all_of(numeric_cols), ~ round(.x, 2))) # Optional: Round for presentation
# Render LaTeX-safe table
summary_data %>%
kable(format = "latex", booktabs = TRUE, caption = "Summary of AI Indicators by Region, Income Group, and Political Regime") %>%
kable_styling(latex_options = c("striped", "scale_down", "hold_position"))
# Load necessary package
library(ggcorrplot)
# Compute correlation matrix of numeric variables
cor_matrix <- cleaned_data_scaled %>%
select(Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index, AI_Score) %>%
cor(use = "pairwise.complete.obs")
# Plot correlation heatmap
corr_fig <- ggcorrplot(cor_matrix,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method = "circle",
colors = c("red", "white", "blue"),
title = "Correlation Matrix of AI Determinants",
ggtheme = theme_minimal(base_size = 12))
# Display the plot
print(corr_fig)
library(knitr)
library(kableExtra)
library(tibble)
# Original matrix input
cor_matrix <- matrix(c(
1.000, 0.773, 0.322, 0.561, 0.025, -0.294, 0.862,
0.773, 1.000, 0.422, 0.446, -0.102, -0.208, 0.941,
0.322, 0.422, 1.000, 0.184, -0.158, -0.224, 0.532,
0.561, 0.446, 0.184, 1.000, 0.183, -0.417, 0.547,
0.025, -0.102, -0.158, 0.183, 1.000, -0.019, -0.105,
-0.294, -0.208, -0.224, -0.417, -0.019, 1.000, -0.257,
0.862, 0.941, 0.532, 0.547, -0.105, -0.257, 1.000
), nrow = 7, byrow = TRUE)
vars <- c("Talent", "RnD", "Policy", "GDP_PC", "GDP", "Gini", "AI_Score")
dimnames(cor_matrix) <- list(vars, vars)
# Keep only lower triangle and blank out upper
lower_tri <- cor_matrix
lower_tri[upper.tri(lower_tri)] <- ""
# Convert to data frame, round values, and format
cor_df <- as.data.frame(lower_tri)
cor_df <- apply(cor_df, 2, function(x) ifelse(x == "", "", sprintf("%.3f", as.numeric(x))))
cor_df <- as.data.frame(cor_df)
cor_df <- rownames_to_column(cor_df, " ")
# Display clean table
kable(cor_df, format = "html", escape = FALSE,
caption = "Table: Correlation Matrix of AI Competitiveness Indicators (Lower Triangle)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
| Talent | RnD | Policy | GDP_PC | GDP | Gini | AI_Score | |
|---|---|---|---|---|---|---|---|
| Talent | 1.000 | ||||||
| RnD | 0.773 | 1.000 | |||||
| Policy | 0.322 | 0.422 | 1.000 | ||||
| GDP_PC | 0.561 | 0.446 | 0.184 | 1.000 | |||
| GDP | 0.025 | -0.102 | -0.158 | 0.183 | 1.000 | ||
| Gini | -0.294 | -0.208 | -0.224 | -0.417 | -0.019 | 1.000 | |
| AI_Score | 0.862 | 0.941 | 0.532 | 0.547 | -0.105 | -0.257 | 1.000 |
# Save correlation heatmap
ggsave("AI_Correlation_Matrix.png", plot = corr_fig, width = 8, height = 6, dpi = 300)
# Run ANOVA and Tukey HSD for each numeric variable
tukey_results <- list()
for (var in c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")) {
aov_model <- aov(scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
tukey_results[[var]] <- TukeyHSD(aov_model)
print(paste("Tukey HSD for:", var))
print(tukey_results[[var]])
}
## [1] "Tukey HSD for: Talent"
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
##
## $`cleaned_data_scaled$Tier`
## diff lwr upr p adj
## Laggard-Follower -0.5228938 -1.0723991 0.02661154 0.0653824
## Leader-Follower 1.1168768 0.5673714 1.66638209 0.0000243
## Leader-Laggard 1.6397706 1.0970079 2.18253323 0.0000000
##
## [1] "Tukey HSD for: RnD"
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
##
## $`cleaned_data_scaled$Tier`
## diff lwr upr p adj
## Laggard-Follower -0.5115422 -1.0340809 0.01099655 0.0562536
## Leader-Follower 1.2021358 0.6795971 1.72467456 0.0000023
## Leader-Laggard 1.7136780 1.1975511 2.22980498 0.0000000
##
## [1] "Tukey HSD for: Policy"
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
##
## $`cleaned_data_scaled$Tier`
## diff lwr upr p adj
## Laggard-Follower -0.9735183 -1.6174180 -0.3296186 0.0016712
## Leader-Follower 0.2575506 -0.3863491 0.9014503 0.6037905
## Leader-Laggard 1.2310688 0.5950701 1.8670676 0.0000556
##
## [1] "Tukey HSD for: GDP_Per_Capita"
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
##
## $`cleaned_data_scaled$Tier`
## diff lwr upr p adj
## Laggard-Follower -0.8162374 -1.4041393 -0.2283356 0.0041734
## Leader-Follower 0.7550789 0.1526587 1.3574990 0.0105316
## Leader-Laggard 1.5713163 0.9834144 2.1592182 0.0000001
##
## [1] "Tukey HSD for: GDP"
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
##
## $`cleaned_data_scaled$Tier`
## diff lwr upr p adj
## Laggard-Follower 0.20488252 -0.5666479 0.9764130 0.7991098
## Leader-Follower -0.02182983 -0.8124133 0.7687536 0.9975666
## Leader-Laggard -0.22671234 -0.9982428 0.5448181 0.7601101
##
## [1] "Tukey HSD for: Gini_Index"
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
##
## $`cleaned_data_scaled$Tier`
## diff lwr upr p adj
## Laggard-Follower 1.1035916 0.3738370 1.8333462 0.0017832
## Leader-Follower 0.2226252 -0.5264961 0.9717465 0.7539105
## Leader-Laggard -0.8809664 -1.5989750 -0.1629579 0.0127160
##
## [1] "Tukey HSD for: AI_Score"
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = scale(cleaned_data_scaled[[var]]) ~ cleaned_data_scaled$Tier)
##
## $`cleaned_data_scaled$Tier`
## diff lwr upr p adj
## Laggard-Follower -0.8025245 -1.296882 -0.3081668 7.12e-04
## Leader-Follower 1.0302797 0.535922 1.5246374 1.55e-05
## Leader-Laggard 1.8328042 1.344512 2.3210959 0.00e+00
library(knitr)
library(kableExtra)
# Define the Tukey HSD results as a data frame
tukey_data <- data.frame(
Variable = c("Talent", "Talent", "Talent", "Talent",
"RnD", "RnD", "RnD", "RnD",
"Policy", "Policy", "Policy", "Policy",
"GDP_Per_Capita", "GDP_Per_Capita", "GDP_Per_Capita", "GDP_Per_Capita",
"GDP", "GDP", "GDP", "GDP",
"Gini_Index", "Gini_Index", "Gini_Index", "Gini_Index",
"AI_Score", "AI_Score", "AI_Score", "AI_Score"),
Comparison = c("Laggard-Follower", "Leader-Follower", "Leader-Laggard", "p adj",
"Laggard-Follower", "Leader-Follower", "Leader-Laggard", "p adj",
"Laggard-Follower", "Leader-Follower", "Leader-Laggard", "p adj",
"Laggard-Follower", "Leader-Follower", "Leader-Laggard", "p adj",
"Laggard-Follower", "Leader-Follower", "Leader-Laggard", "p adj",
"Laggard-Follower", "Leader-Follower", "Leader-Laggard", "p adj",
"Laggard-Follower", "Leader-Follower", "Leader-Laggard", "p adj"),
Diff = c(-0.5228938, 1.1168768, 1.6397706, 0.0653824,
-0.5115422, 1.2021358, 1.7136780, 0.0562536,
-0.9735183, 0.2575506, 1.2310688, 0.6037905,
-0.8162374, 0.7550789, 1.5713163, 0.0105316,
0.20488252, -0.02182983, -0.22671234, 0.7991098,
1.1035916, 0.2226252, -0.8809664, 0.7539105,
-0.8025245, 1.0302797, 1.8328042, 1.55e-05),
Lwr = c(-1.0723991, 0.5673714, 1.0970079, -1.296882,
-1.0340809, 0.6795971, 1.1975511, -1.296882,
-1.6174180, -0.3863491, 0.5950701, -1.296882,
-1.4041393, 0.1526587, 0.9834144, -0.2283356,
-0.5666479, -0.8124133, -0.9982428, -0.2283356,
0.3738370, -0.5264961, -1.5989750, -0.5264961,
-1.296882, 0.535922, 1.344512, 0.535922),
Upr = c(0.02661154, 1.66638209, 2.18253323, 0.02661154,
0.01099655, 1.72467456, 2.22980498, 1.72467456,
-0.3296186, 0.9014503, 1.8670676, 0.9014503,
-0.2283356, 1.3574990, 2.1592182, 1.3574990,
0.9764130, 0.7687536, 0.5448181, 0.9764130,
1.8333462, 0.9717465, -0.1629579, 1.8333462,
-0.3081668, 1.5246374, 2.3210959, 1.5246374),
P_Adj = c(0.0653824, 0.0000243, 0.0000000, 7.12e-04,
0.0562536, 0.0000023, 0.0000000, 0.0000023,
0.0016712, 0.6037905, 0.0000556, 0.6037905,
0.0041734, 0.0105316, 0.0000001, 0.0105316,
0.7991098, 0.9975666, 0.7601101, 0.9975666,
0.0017832, 0.7539105, 0.0127160, 0.7539105,
7.12e-04, 1.55e-05, 0.00e+00, 1.55e-05)
)
# Create the table
tukey_data %>%
kable("html", caption = "Tukey HSD Test Results for Variables by Tier") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
column_spec(1, width = "1.5in") %>%
column_spec(2, width = "2.5in") %>%
column_spec(3, width = "1in") %>%
column_spec(4, width = "1in") %>%
column_spec(5, width = "1in") %>%
column_spec(6, width = "1in")
| Variable | Comparison | Diff | Lwr | Upr | P_Adj |
|---|---|---|---|---|---|
| Talent | Laggard-Follower | -0.5228938 | -1.0723991 | 0.0266115 | 0.0653824 |
| Talent | Leader-Follower | 1.1168768 | 0.5673714 | 1.6663821 | 0.0000243 |
| Talent | Leader-Laggard | 1.6397706 | 1.0970079 | 2.1825332 | 0.0000000 |
| Talent | p adj | 0.0653824 | -1.2968820 | 0.0266115 | 0.0007120 |
| RnD | Laggard-Follower | -0.5115422 | -1.0340809 | 0.0109966 | 0.0562536 |
| RnD | Leader-Follower | 1.2021358 | 0.6795971 | 1.7246746 | 0.0000023 |
| RnD | Leader-Laggard | 1.7136780 | 1.1975511 | 2.2298050 | 0.0000000 |
| RnD | p adj | 0.0562536 | -1.2968820 | 1.7246746 | 0.0000023 |
| Policy | Laggard-Follower | -0.9735183 | -1.6174180 | -0.3296186 | 0.0016712 |
| Policy | Leader-Follower | 0.2575506 | -0.3863491 | 0.9014503 | 0.6037905 |
| Policy | Leader-Laggard | 1.2310688 | 0.5950701 | 1.8670676 | 0.0000556 |
| Policy | p adj | 0.6037905 | -1.2968820 | 0.9014503 | 0.6037905 |
| GDP_Per_Capita | Laggard-Follower | -0.8162374 | -1.4041393 | -0.2283356 | 0.0041734 |
| GDP_Per_Capita | Leader-Follower | 0.7550789 | 0.1526587 | 1.3574990 | 0.0105316 |
| GDP_Per_Capita | Leader-Laggard | 1.5713163 | 0.9834144 | 2.1592182 | 0.0000001 |
| GDP_Per_Capita | p adj | 0.0105316 | -0.2283356 | 1.3574990 | 0.0105316 |
| GDP | Laggard-Follower | 0.2048825 | -0.5666479 | 0.9764130 | 0.7991098 |
| GDP | Leader-Follower | -0.0218298 | -0.8124133 | 0.7687536 | 0.9975666 |
| GDP | Leader-Laggard | -0.2267123 | -0.9982428 | 0.5448181 | 0.7601101 |
| GDP | p adj | 0.7991098 | -0.2283356 | 0.9764130 | 0.9975666 |
| Gini_Index | Laggard-Follower | 1.1035916 | 0.3738370 | 1.8333462 | 0.0017832 |
| Gini_Index | Leader-Follower | 0.2226252 | -0.5264961 | 0.9717465 | 0.7539105 |
| Gini_Index | Leader-Laggard | -0.8809664 | -1.5989750 | -0.1629579 | 0.0127160 |
| Gini_Index | p adj | 0.7539105 | -0.5264961 | 1.8333462 | 0.7539105 |
| AI_Score | Laggard-Follower | -0.8025245 | -1.2968820 | -0.3081668 | 0.0007120 |
| AI_Score | Leader-Follower | 1.0302797 | 0.5359220 | 1.5246374 | 0.0000155 |
| AI_Score | Leader-Laggard | 1.8328042 | 1.3445120 | 2.3210959 | 0.0000000 |
| AI_Score | p adj | 0.0000155 | 0.5359220 | 1.5246374 | 0.0000155 |
library(psych)
library(corrplot)
# Numeric data
numeric_data <- cleaned_data_scaled %>%
select(Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index, AI_Score)
# Correlation and p-values
cor_result <- corr.test(numeric_data, use = "pairwise.complete.obs", method = "pearson")
# Plot significant correlation matrix
corrplot(cor_result$r,
method = "color",
type = "lower",
p.mat = cor_result$p,
sig.level = 0.05,
insig = "blank",
tl.cex = 0.8,
addCoef.col = "black",
col = colorRampPalette(c("red", "white", "blue"))(200),
title = "Correlation Matrix (Significant at p < 0.05)",
mar = c(0,0,2,0))
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise scatterplots with Tier as color
ggpairs(cleaned_data_scaled %>%
select(Tier, Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index, AI_Score),
mapping = aes(color = Tier, alpha = 0.7),
title = "Pairwise Scatterplots of Key AI Indicators")
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 10 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 10 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 10 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 10 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 10 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 10 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
4. Heatmap of Standardized Variables by Country
library(reshape2)
library(pheatmap)
# Unload plyr if it is loaded
if("package:plyr" %in% search()) {
detach("package:plyr", unload = TRUE)
}
# Ensure the numeric column names are correctly defined
numeric_cols <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")
# Step 1: Summarize the numeric indicators by grouping variables
summary_data <- cleaned_data_scaled %>%
select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
group_by(Region, `Income group`, `Political regime`) %>%
summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop")
# Check the result
head(summary_data)
## # A tibble: 6 × 10
## Region `Income group` `Political regime` Talent RnD Policy GDP_Per_Capita
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Africa Lower middle Closed autocracy -0.884 -0.846 -1.60 -1.08
## 2 Africa Lower middle Electoral autocra… -0.990 -0.680 -1.91 -1.13
## 3 Africa Lower middle Electoral democra… -0.517 -0.778 -1.74 -1.05
## 4 Africa Upper middle Electoral democra… -0.801 -0.652 -2.20 -0.972
## 5 Americas High Liberal democracy 1.35 0.946 0.356 -0.143
## 6 Americas Upper middle Electoral democra… -0.635 -0.692 0.292 -0.802
## # ℹ 3 more variables: GDP <dbl>, Gini_Index <dbl>, AI_Score <dbl>
# Check for package conflicts (Make sure plyr is not loaded)
if("package:plyr" %in% search()) {
detach("package:plyr", unload = TRUE)
}
# Ensure numeric columns are correctly defined
numeric_cols <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")
# Summarize numeric indicators by grouping variables
summary_data <- cleaned_data_scaled %>%
select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
group_by(Region, `Income group`, `Political regime`) %>%
dplyr::summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop")
# Check the result
head(summary_data)
## # A tibble: 6 × 10
## Region `Income group` `Political regime` Talent RnD Policy GDP_Per_Capita
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Africa Lower middle Closed autocracy -0.884 -0.846 -1.60 -1.08
## 2 Africa Lower middle Electoral autocra… -0.990 -0.680 -1.91 -1.13
## 3 Africa Lower middle Electoral democra… -0.517 -0.778 -1.74 -1.05
## 4 Africa Upper middle Electoral democra… -0.801 -0.652 -2.20 -0.972
## 5 Americas High Liberal democracy 1.35 0.946 0.356 -0.143
## 6 Americas Upper middle Electoral democra… -0.635 -0.692 0.292 -0.802
## # ℹ 3 more variables: GDP <dbl>, Gini_Index <dbl>, AI_Score <dbl>
# Load required packages
library(dplyr)
library(gt)
# Create the summary table
summary_table <- cleaned_data_scaled %>%
select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
group_by(Region, `Income group`, `Political regime`) %>%
summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop")
# Generate a professional table with gt
summary_table %>%
gt() %>%
tab_header(
title = "Summary of AI Indicators by Region, Income Group, and Political Regime",
subtitle = "Mean values of standardized indicators"
) %>%
fmt_number(columns = numeric_cols, decimals = 2) %>%
cols_label(
Region = "Region",
`Income group` = "Income Group",
`Political regime` = "Political Regime",
Talent = "Talent",
RnD = "R&D",
Policy = "Policy",
GDP_Per_Capita = "GDP per Capita",
GDP = "GDP",
Gini_Index = "Gini Index",
AI_Score = "AI Score"
) %>%
tab_options(
table.font.size = px(12),
heading.title.font.size = px(16),
heading.subtitle.font.size = px(12),
column_labels.font.weight = "bold"
)
| Summary of AI Indicators by Region, Income Group, and Political Regime | |||||||||
| Mean values of standardized indicators | |||||||||
| Region | Income Group | Political Regime | Talent | R&D | Policy | GDP per Capita | GDP | Gini Index | AI Score |
|---|---|---|---|---|---|---|---|---|---|
| Africa | Lower middle | Closed autocracy | −0.88 | −0.85 | −1.60 | −1.08 | −0.42 | 0.60 | −0.99 |
| Africa | Lower middle | Electoral autocracy | −0.99 | −0.68 | −1.91 | −1.13 | 0.25 | 0.39 | −1.46 |
| Africa | Lower middle | Electoral democracy | −0.52 | −0.78 | −1.74 | −1.05 | −0.78 | −0.31 | −0.93 |
| Africa | Upper middle | Electoral democracy | −0.80 | −0.65 | −2.20 | −0.97 | 0.70 | 3.81 | −0.94 |
| Americas | High | Liberal democracy | 1.35 | 0.95 | 0.36 | −0.14 | −0.46 | 0.55 | 1.21 |
| Americas | Upper middle | Electoral democracy | −0.64 | −0.69 | 0.29 | −0.80 | 0.04 | 1.80 | −0.53 |
| Asia-Pacific | High | Electoral democracy | 0.77 | 0.53 | −0.05 | 0.98 | 0.73 | NaN | 0.66 |
| Asia-Pacific | High | Liberal democracy | 0.09 | 0.80 | 0.43 | 0.60 | −0.71 | −0.30 | 0.45 |
| Asia-Pacific | Lower middle | Closed autocracy | −0.69 | −0.82 | 0.42 | −1.10 | 0.71 | 0.09 | −0.81 |
| Asia-Pacific | Lower middle | Electoral autocracy | 0.65 | −0.14 | −0.82 | −1.14 | −0.19 | −0.19 | −0.58 |
| Asia-Pacific | Lower middle | Electoral democracy | −0.72 | −0.81 | −0.38 | −1.04 | −0.81 | 0.50 | −0.98 |
| Asia-Pacific | Upper middle | Closed autocracy | −0.02 | 3.39 | 1.41 | −0.83 | −0.90 | 0.47 | 2.58 |
| Asia-Pacific | Upper middle | Electoral autocracy | −0.42 | −0.71 | −0.39 | −0.80 | 0.70 | 0.82 | −0.48 |
| Europe | High | Electoral autocracy | −0.42 | −0.61 | −0.11 | −0.62 | −0.23 | −0.75 | −0.46 |
| Europe | High | Electoral democracy | −0.26 | −0.42 | 0.41 | −0.28 | 0.34 | −0.52 | −0.17 |
| Europe | High | Liberal democracy | 0.45 | 0.27 | 0.17 | 0.99 | 0.09 | −0.53 | 0.36 |
| Europe | Upper middle | Electoral autocracy | −0.29 | 0.06 | 1.24 | −0.80 | −0.96 | 0.33 | −0.13 |
| Europe | Upper middle | Electoral democracy | −0.66 | −0.87 | −1.66 | −1.02 | −0.89 | −0.71 | −1.02 |
| Middle East | High | Closed autocracy | −0.91 | −0.44 | −0.07 | 0.35 | −0.16 | −1.24 | −0.37 |
| Middle East | High | Liberal democracy | 1.25 | 0.82 | −0.53 | 0.43 | 1.18 | 0.54 | 1.06 |
| Middle East | Lower middle | Electoral autocracy | −1.03 | −0.79 | 0.41 | −1.09 | 0.99 | −0.49 | −1.26 |
# Load required packages
library(dplyr)
library(knitr)
library(kableExtra)
# Create the summary table
summary_table <- cleaned_data_scaled %>%
select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
group_by(Region, `Income group`, `Political regime`) %>%
summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop")
# Display with kableExtra
summary_table %>%
mutate(across(all_of(numeric_cols), ~ round(.x, 2))) %>%
kable("html", caption = "Summary of AI Indicators by Region, Income Group, and Political Regime") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE,
position = "center")
| Region | Income group | Political regime | Talent | RnD | Policy | GDP_Per_Capita | GDP | Gini_Index | AI_Score |
|---|---|---|---|---|---|---|---|---|---|
| Africa | Lower middle | Closed autocracy | -0.88 | -0.85 | -1.60 | -1.08 | -0.42 | 0.60 | -0.99 |
| Africa | Lower middle | Electoral autocracy | -0.99 | -0.68 | -1.91 | -1.13 | 0.25 | 0.39 | -1.46 |
| Africa | Lower middle | Electoral democracy | -0.52 | -0.78 | -1.74 | -1.05 | -0.78 | -0.31 | -0.93 |
| Africa | Upper middle | Electoral democracy | -0.80 | -0.65 | -2.20 | -0.97 | 0.70 | 3.81 | -0.94 |
| Americas | High | Liberal democracy | 1.35 | 0.95 | 0.36 | -0.14 | -0.46 | 0.55 | 1.21 |
| Americas | Upper middle | Electoral democracy | -0.64 | -0.69 | 0.29 | -0.80 | 0.04 | 1.80 | -0.53 |
| Asia-Pacific | High | Electoral democracy | 0.77 | 0.53 | -0.05 | 0.98 | 0.73 | NaN | 0.66 |
| Asia-Pacific | High | Liberal democracy | 0.09 | 0.80 | 0.43 | 0.60 | -0.71 | -0.30 | 0.45 |
| Asia-Pacific | Lower middle | Closed autocracy | -0.69 | -0.82 | 0.42 | -1.10 | 0.71 | 0.09 | -0.81 |
| Asia-Pacific | Lower middle | Electoral autocracy | 0.65 | -0.14 | -0.82 | -1.14 | -0.19 | -0.19 | -0.58 |
| Asia-Pacific | Lower middle | Electoral democracy | -0.72 | -0.81 | -0.38 | -1.04 | -0.81 | 0.50 | -0.98 |
| Asia-Pacific | Upper middle | Closed autocracy | -0.02 | 3.39 | 1.41 | -0.83 | -0.90 | 0.47 | 2.58 |
| Asia-Pacific | Upper middle | Electoral autocracy | -0.42 | -0.71 | -0.39 | -0.80 | 0.70 | 0.82 | -0.48 |
| Europe | High | Electoral autocracy | -0.42 | -0.61 | -0.11 | -0.62 | -0.23 | -0.75 | -0.46 |
| Europe | High | Electoral democracy | -0.26 | -0.42 | 0.41 | -0.28 | 0.34 | -0.52 | -0.17 |
| Europe | High | Liberal democracy | 0.45 | 0.27 | 0.17 | 0.99 | 0.09 | -0.53 | 0.36 |
| Europe | Upper middle | Electoral autocracy | -0.29 | 0.06 | 1.24 | -0.80 | -0.96 | 0.33 | -0.13 |
| Europe | Upper middle | Electoral democracy | -0.66 | -0.87 | -1.66 | -1.02 | -0.89 | -0.71 | -1.02 |
| Middle East | High | Closed autocracy | -0.91 | -0.44 | -0.07 | 0.35 | -0.16 | -1.24 | -0.37 |
| Middle East | High | Liberal democracy | 1.25 | 0.82 | -0.53 | 0.43 | 1.18 | 0.54 | 1.06 |
| Middle East | Lower middle | Electoral autocracy | -1.03 | -0.79 | 0.41 | -1.09 | 0.99 | -0.49 | -1.26 |
# Load necessary packages
library(dplyr)
library(tibble)
library(pheatmap)
# Detach plyr if loaded (to prevent conflict with dplyr)
if ("package:plyr" %in% search()) {
detach("package:plyr", unload = TRUE)
}
# Define numeric columns
numeric_cols <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")
# Summarize the data by Region
region_summary <- cleaned_data_scaled %>%
group_by(Region) %>%
summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
mutate(Group = paste("Region", Region, sep = "_")) %>%
select(Group, all_of(numeric_cols)) %>%
column_to_rownames("Group") %>%
as.matrix()
# Plot heatmap for Region
pheatmap(region_summary,
scale = "none",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
fontsize = 10,
main = "Heatmap: Standardized AI Metrics by Region")
# Summarize the data by Income group
income_summary <- cleaned_data_scaled %>%
group_by(`Income group`) %>%
summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
mutate(Group = paste("Income", `Income group`, sep = "_")) %>%
select(Group, all_of(numeric_cols)) %>%
column_to_rownames("Group") %>%
as.matrix()
# Plot heatmap for Income group
pheatmap(income_summary,
scale = "none",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
fontsize = 10,
main = "Heatmap: Standardized AI Metrics by Income Group")
# Summarize the data by Political regime
regime_summary <- cleaned_data_scaled %>%
group_by(`Political regime`) %>%
summarise(across(all_of(numeric_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
mutate(Group = paste("Regime", `Political regime`, sep = "_")) %>%
select(Group, all_of(numeric_cols)) %>%
column_to_rownames("Group") %>%
as.matrix()
# Plot heatmap for Political regime
pheatmap(regime_summary,
scale = "none",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
fontsize = 10,
main = "Heatmap: Standardized AI Metrics by Political Regime")
# Add a unique identifier for each row
region_data <- summary_data %>%
mutate(UniqueID = row_number()) %>%
select(UniqueID, `Talent`, `RnD`, `Policy`, `GDP_Per_Capita`, `GDP`, `Gini_Index`, `AI_Score`) %>%
column_to_rownames("UniqueID") %>% # Set unique ID as row names
as.matrix()
# Check the result
head(region_data)
## Talent RnD Policy GDP_Per_Capita GDP Gini_Index
## 1 -0.8835424 -0.8455610 -1.5985421 -1.0758032 -0.41742383 0.6045741
## 2 -0.9896879 -0.6801203 -1.9089894 -1.1301460 0.24587367 0.3928943
## 3 -0.5167981 -0.7780284 -1.7402432 -1.0474504 -0.77746434 -0.3104286
## 4 -0.8013864 -0.6522878 -2.2041999 -0.9718430 0.70055341 3.8139116
## 5 1.3458420 0.9456066 0.3556565 -0.1433123 -0.45623962 0.5544993
## 6 -0.6352671 -0.6918467 0.2923291 -0.8017264 0.03849034 1.7995401
## AI_Score
## 1 -0.9947824
## 2 -1.4596193
## 3 -0.9286605
## 4 -0.9392400
## 5 1.2088947
## 6 -0.5322598
# Load necessary packages
library(dplyr)
library(ggplot2)
library(tidyr)
# Detach plyr if loaded (to prevent conflict with dplyr)
if ("package:plyr" %in% search()) {
detach("package:plyr", unload = TRUE)
}
# Define numeric columns
numeric_cols <- c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index", "AI_Score")
# Pivot the data to long format for plotting
long_data <- cleaned_data_scaled %>%
select(Region, `Income group`, `Political regime`, all_of(numeric_cols)) %>%
pivot_longer(cols = all_of(numeric_cols), names_to = "Indicator", values_to = "Value")
# --- BOXPLOT BY REGION ---
ggplot(long_data, aes(x = Region, y = Value, fill = Region)) +
geom_boxplot() +
facet_wrap(~ Indicator, scales = "free_y") +
theme_minimal() +
labs(title = "Boxplot of AI Indicators by Region", y = "Value", x = "Region") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# --- BOXPLOT BY INCOME GROUP ---
ggplot(long_data, aes(x = `Income group`, y = Value, fill = `Income group`)) +
geom_boxplot() +
facet_wrap(~ Indicator, scales = "free_y") +
theme_minimal() +
labs(title = "Boxplot of AI Indicators by Income Group", y = "Value", x = "Income Group") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# --- BOXPLOT BY POLITICAL REGIME ---
ggplot(long_data, aes(x = `Political regime`, y = Value, fill = `Political regime`)) +
geom_boxplot() +
facet_wrap(~ Indicator, scales = "free_y") +
theme_minimal() +
labs(title = "Boxplot of AI Indicators by Political Regime", y = "Value", x = "Political Regime") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# 6. Split data for modeling
set.seed(131017)
train_index <- createDataPartition(cleaned_data_scaled$AI_Score, p = 0.8, list = FALSE)
train_data <- cleaned_data_scaled[train_index, ]
test_data <- cleaned_data_scaled[-train_index, ]
# Load necessary library for PLSR if not already
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# 7. Prepare modeling data and remove NAs
model_data <- cleaned_data_scaled %>%
select(AI_Score, Talent, RnD, Policy, GDP_Per_Capita, GDP, Gini_Index) %>%
na.omit()
# Split into training and testing sets
set.seed(131017)
train_index <- createDataPartition(model_data$AI_Score, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# -------------------------------
# 8. Partial Least Squares Regression (PLSR)
set.seed(131017)
plsr_model <- plsr(
AI_Score ~ .,
data = train_data,
scale = TRUE,
validation = "CV"
)
# Predict using 2 components (adjustable)
plsr_pred <- predict(plsr_model, newdata = test_data, ncomp = 2)
plsr_rmse <- sqrt(mean((plsr_pred - test_data$AI_Score)^2))
print(paste("PLSR RMSE (2 components):", round(plsr_rmse, 3)))
## [1] "PLSR RMSE (2 components): 0.315"
# Optional: Full component RMSE
plsr_pred_all <- predict(plsr_model, newdata = test_data, ncomp = plsr_model$ncomp)
plsr_rmse_all <- sqrt(mean((plsr_pred_all - test_data$AI_Score)^2))
print(paste("PLSR RMSE (all components):", round(plsr_rmse_all, 3)))
## [1] "PLSR RMSE (all components): 0.299"
# -------------------------------
# 9. Random Forest
set.seed(131017)
rf_model <- randomForest(
AI_Score ~ .,
data = train_data,
importance = TRUE
)
rf_pred <- predict(rf_model, newdata = test_data)
rf_rmse <- sqrt(mean((rf_pred - test_data$AI_Score)^2))
print(paste("Random Forest RMSE:", round(rf_rmse, 3)))
## [1] "Random Forest RMSE: 0.295"
# -------------------------------
# 10. Neural Network (MLP)
set.seed(131017)
nn_model <- nnet(
AI_Score ~ .,
data = train_data,
size = 5, # Number of hidden units
linout = TRUE, # Regression output
decay = 0.01, # Weight decay (regularization)
maxit = 500 # Maximum iterations
)
## # weights: 41
## initial value 53.352396
## iter 10 value 2.305113
## iter 20 value 0.940506
## iter 30 value 0.679601
## iter 40 value 0.638504
## iter 50 value 0.629446
## iter 60 value 0.626957
## iter 70 value 0.626213
## iter 80 value 0.626007
## iter 90 value 0.625953
## iter 100 value 0.625911
## iter 110 value 0.625904
## iter 120 value 0.625903
## final value 0.625903
## converged
nn_pred <- predict(nn_model, newdata = test_data)
nn_rmse <- sqrt(mean((nn_pred - test_data$AI_Score)^2))
print(paste("Neural Network RMSE:", round(nn_rmse, 3)))
## [1] "Neural Network RMSE: 0.226"
plsr_model
## Partial least squares regression, fitted with the kernel algorithm.
## Cross-validated using 10 random segments.
## Call:
## plsr(formula = AI_Score ~ ., data = train_data, scale = TRUE, validation = "CV")
print(paste("PLSR RMSE (2 components):", round(plsr_rmse, 3)))
## [1] "PLSR RMSE (2 components): 0.315"
# Model Performance: The random forest model explains 71.17% of the variance, with a mean squared residual of 0.1762446.
rf_model
##
## Call:
## randomForest(formula = AI_Score ~ ., data = train_data, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 0.1762446
## % Var explained: 71.17
print(paste("Random Forest RMSE:", round(rf_rmse, 3)))
## [1] "Random Forest RMSE: 0.295"
# Extract variable importance from the trained model
rf_varimp <- randomForest::importance(rf_model)
# Convert to data frame
rf_varimp_df <- as.data.frame(rf_varimp) %>%
rownames_to_column(var = "Variable")
# Check the result
print(head(rf_varimp_df))
## Variable %IncMSE IncNodePurity
## 1 Talent 13.223601 6.236898
## 2 RnD 15.731153 8.084483
## 3 Policy 6.488547 3.902219
## 4 GDP_Per_Capita 10.131607 4.465569
## 5 GDP 1.353361 1.229903
## 6 Gini_Index 4.596936 1.585927
rf_varimp <- varImp(rf_model, scale = TRUE)
print(rf_varimp)
## Overall
## Talent 13.223601
## RnD 15.731153
## Policy 6.488547
## GDP_Per_Capita 10.131607
## GDP 1.353361
## Gini_Index 4.596936
library(ggplot2)
# Create the data frame
importance_df <- data.frame(
Variable = c("Talent", "RnD", "Policy", "GDP_Per_Capita", "GDP", "Gini_Index"),
Importance = c(13.223601, 15.731153, 6.488547, 10.131607, 1.353361, 4.596936)
)
# Plot
ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_col(fill = "#2E86C1", width = 0.7) +
coord_flip() +
labs(
title = "Variable Importance",
x = "Variable",
y = "Importance Score"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text = element_text(color = "black"),
panel.grid.major.y = element_blank()
)
nn_model
## a 6-5-1 network with 41 weights
## inputs: Talent RnD Policy GDP_Per_Capita GDP Gini_Index
## output(s): AI_Score
## options were - linear output units decay=0.01
print(paste("Neural Network RMSE:", round(nn_rmse, 3)))
## [1] "Neural Network RMSE: 0.226"
# Load required packages
library(dplyr)
library(gt)
# Store model RMSEs
model_performance <- tibble(
Model = c("PLSR (2 Components)", "PLSR (All Components)", "Random Forest", "Neural Network"),
RMSE = c(
round(plsr_rmse, 3),
round(plsr_rmse_all, 3),
round(rf_rmse, 3),
round(nn_rmse, 3)
)
)
# Create professional publication-style table
performance_table <- model_performance %>%
gt() %>%
tab_header(
title = md("**Model Comparison: Predictive Accuracy on AI Competitiveness**"),
subtitle = "Root Mean Square Error (RMSE) on Test Set"
) %>%
cols_label(
Model = "Model Type",
RMSE = "Test RMSE"
) %>%
fmt_number(columns = RMSE, decimals = 3) %>%
tab_options(
table.font.names = "Times New Roman",
table.border.top.width = px(2),
heading.align = "center",
column_labels.font.weight = "bold"
)
# View the table
performance_table
| Model Comparison: Predictive Accuracy on AI Competitiveness | |
| Root Mean Square Error (RMSE) on Test Set | |
| Model Type | Test RMSE |
|---|---|
| PLSR (2 Components) | 0.315 |
| PLSR (All Components) | 0.299 |
| Random Forest | 0.295 |
| Neural Network | 0.226 |
set.seed(131017)
plsr_model <- plsr(AI_Score ~ ., data = train_data, scale = TRUE, validation = "CV")
plsr_pred <- predict(plsr_model, newdata = test_data, ncomp = 2)
plsr_rmse <- sqrt(mean((plsr_pred - test_data$AI_Score)^2))
plsr_pred_all <- predict(plsr_model, newdata = test_data, ncomp = plsr_model$ncomp)
plsr_rmse_all <- sqrt(mean((plsr_pred_all - test_data$AI_Score)^2))
plsr_rmse_all
## [1] 0.2986293
set.seed(131017)
rf_model <- randomForest(AI_Score ~ ., data = train_data, importance = TRUE)
rf_pred <- predict(rf_model, newdata = test_data)
rf_rmse <- sqrt(mean((rf_pred - test_data$AI_Score)^2))
rf_rmse
## [1] 0.2953213
set.seed(131017)
nn_model <- nnet(
AI_Score ~ .,
data = train_data,
size = 5,
linout = TRUE,
decay = 0.01,
maxit = 500
)
## # weights: 41
## initial value 53.352396
## iter 10 value 2.305113
## iter 20 value 0.940506
## iter 30 value 0.679601
## iter 40 value 0.638504
## iter 50 value 0.629446
## iter 60 value 0.626957
## iter 70 value 0.626213
## iter 80 value 0.626007
## iter 90 value 0.625953
## iter 100 value 0.625911
## iter 110 value 0.625904
## iter 120 value 0.625903
## final value 0.625903
## converged
nn_pred <- predict(nn_model, newdata = test_data)
nn_rmse <- sqrt(mean((nn_pred - test_data$AI_Score)^2))
nn_rmse
## [1] 0.2260537
model_performance <- tibble(
Model = c("PLSR (2 Components)", "PLSR (All Components)", "Random Forest", "Neural Network"),
RMSE = c(round(plsr_rmse, 3), round(plsr_rmse_all, 3), round(rf_rmse, 3), round(nn_rmse, 3))
)
model_performance %>%
gt() %>%
tab_header(
title = md("**Model Comparison: Predictive Accuracy on AI Competitiveness**"),
subtitle = "Root Mean Square Error (RMSE) on Test Set"
) %>%
cols_label(Model = "Model Type", RMSE = "Test RMSE") %>%
fmt_number(columns = RMSE, decimals = 3) %>%
tab_options(
table.font.names = "Times New Roman",
table.border.top.width = px(2),
heading.align = "center",
column_labels.font.weight = "bold"
)
| Model Comparison: Predictive Accuracy on AI Competitiveness | |
| Root Mean Square Error (RMSE) on Test Set | |
| Model Type | Test RMSE |
|---|---|
| PLSR (2 Components) | 0.315 |
| PLSR (All Components) | 0.299 |
| Random Forest | 0.295 |
| Neural Network | 0.226 |
# Load libraries (caret last to avoid train() masking)
library(dplyr)
library(gt)
library(caret)
library(pls)
library(randomForest)
library(nnet)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Create parallel backend for faster CV
cl <- makePSOCKcluster(2) # Use 2 cores (adjust if needed)
registerDoParallel(cl)
# --- PLSR Model ---
# Cross-validated RMSE (2 components)
plsr_cv_rmse <- RMSEP(plsr_model, estimate = "CV")$val[1, 1, 2] # 2 comps
# Correct prediction extraction
plsr_pred_vec <- plsr_pred[, 1, 1]
plsr_rmse <- sqrt(mean((plsr_pred_vec - test_data$AI_Score)^2))
plsr_r2 <- cor(plsr_pred_vec, test_data$AI_Score)^2
plsr_accuracy <- mean(sign(plsr_pred_vec) == sign(test_data$AI_Score))
# --- Random Forest Model ---
rf_rmse <- sqrt(mean((rf_pred - test_data$AI_Score)^2))
rf_cv_rmse <- sqrt(min(rf_model$mse)) # OOB estimate
rf_r2 <- cor(rf_pred, test_data$AI_Score)^2
rf_accuracy <- mean(sign(rf_pred) == sign(test_data$AI_Score))
# --- Neural Network with caret CV ---
set.seed(131017)
nn_ctrl <- trainControl(method = "cv", number = 5)
nn_cv <- caret::train(
AI_Score ~ .,
data = train_data,
method = "nnet",
linout = TRUE,
trace = FALSE,
tuneGrid = expand.grid(size = 5, decay = 0.01),
maxit = 500,
trControl = nn_ctrl
)
nn_cv_rmse <- nn_cv$results$RMSE[1]
# Predict on test data using base model (not caret)
nn_pred <- predict(nn_model, newdata = test_data)
nn_rmse <- sqrt(mean((nn_pred - test_data$AI_Score)^2))
nn_r2 <- cor(nn_pred, test_data$AI_Score)^2
nn_accuracy <- mean(sign(nn_pred) == sign(test_data$AI_Score))
# Stop parallel backend
stopCluster(cl)
# --- Combine all results into a table ---
model_performance <- data.frame(
Model = c("Partial Least Squares Regression (PLSR)",
"Random Forest",
"Neural Network (MLP)"),
Test_RMSE = c(plsr_rmse, rf_rmse, nn_rmse),
CV_RMSE = c(plsr_cv_rmse, rf_cv_rmse, nn_cv_rmse),
R_Squared = c(plsr_r2, rf_r2, nn_r2),
Accuracy = c(plsr_accuracy, rf_accuracy, nn_accuracy)
) %>%
mutate(across(-Model, ~ round(., 3)))
# --- Professional Table Output ---
model_performance %>%
gt() %>%
tab_header(
title = md("**Predicting AI_Score: Model Performance with Cross-Validation.**")
) %>%
cols_label(
Model = "Model",
Test_RMSE = "Test RMSE",
CV_RMSE = "Cross-Validated RMSE",
R_Squared = "R²",
Accuracy = "Directional Accuracy"
) %>%
fmt_number(columns = c(Test_RMSE, CV_RMSE, R_Squared, Accuracy), decimals = 3) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = "Model")
) %>%
tab_options(
table.font.size = 12,
heading.title.font.size = 14,
heading.title.font.weight = "bold",
table.width = pct(90)
)
| Predicting AI_Score: Model Performance with Cross-Validation. | ||||
| Model | Test RMSE | Cross-Validated RMSE | R² | Directional Accuracy |
|---|---|---|---|---|
| Partial Least Squares Regression (PLSR) | 0.315 | 0.319 | 0.869 | 0.875 |
| Random Forest | 0.295 | 0.416 | 0.896 | 0.875 |
| Neural Network (MLP) | 0.226 | 0.357 | 0.927 | 0.875 |
# Load libraries
library(dplyr)
library(gt)
library(caret)
library(pls)
library(randomForest)
library(nnet)
library(doParallel)
# Create parallel backend
cl <- makePSOCKcluster(2) # Adjust number of cores as needed
registerDoParallel(cl)
# --- PLSR Performance ---
plsr_cv <- RMSEP(plsr_model, estimate = "CV")
plsr_cv_rmse <- plsr_cv$val[1, 1, 2] # Mean CV RMSE for 2 comps
plsr_cv_sd <- sd(plsr_cv$val[1, 1, ]) # Approx SD (across comps)
plsr_pred_vec <- plsr_pred[, 1, 1]
plsr_rmse <- sqrt(mean((plsr_pred_vec - test_data$AI_Score)^2))
plsr_r2 <- cor(plsr_pred_vec, test_data$AI_Score)^2
plsr_accuracy <- mean(sign(plsr_pred_vec) == sign(test_data$AI_Score))
# --- Random Forest Performance ---
rf_rmse <- sqrt(mean((rf_pred - test_data$AI_Score)^2))
rf_cv_rmse <- sqrt(min(rf_model$mse)) # OOB estimate
rf_cv_sd <- sd(rf_model$mse) # SD of OOB MSE (approx)
rf_r2 <- cor(rf_pred, test_data$AI_Score)^2
rf_accuracy <- mean(sign(rf_pred) == sign(test_data$AI_Score))
# --- Neural Network Performance (caret for CV) ---
set.seed(131017)
nn_ctrl <- trainControl(method = "cv", number = 5)
nn_cv <- caret::train(
AI_Score ~ .,
data = train_data,
method = "nnet",
linout = TRUE,
trace = FALSE,
tuneGrid = expand.grid(size = 5, decay = 0.01),
maxit = 500,
trControl = nn_ctrl
)
nn_cv_rmse <- nn_cv$results$RMSE[1]
nn_cv_sd <- nn_cv$results$RMSESD[1]
nn_pred <- predict(nn_model, newdata = test_data)
nn_rmse <- sqrt(mean((nn_pred - test_data$AI_Score)^2))
nn_r2 <- cor(nn_pred, test_data$AI_Score)^2
nn_accuracy <- mean(sign(nn_pred) == sign(test_data$AI_Score))
# Stop parallel backend
stopCluster(cl)
# --- Compile Results ---
model_performance <- data.frame(
Model = c("Partial Least Squares Regression (PLSR)",
"Random Forest",
"Neural Network (MLP)"),
Test_RMSE = c(plsr_rmse, rf_rmse, nn_rmse),
CV_RMSE = c(plsr_cv_rmse, rf_cv_rmse, nn_cv_rmse),
CV_RMSE_SD = c(plsr_cv_sd, rf_cv_sd, nn_cv_sd),
R_Squared = c(plsr_r2, rf_r2, nn_r2),
Accuracy = c(plsr_accuracy, rf_accuracy, nn_accuracy)
)
# Round for display
model_performance <- model_performance %>%
mutate(across(where(is.numeric), ~ round(., 3)))
# Identify best models
best_cv_rmse <- min(model_performance$CV_RMSE)
best_r2 <- max(model_performance$R_Squared)
# --- Create GT Table ---
model_performance %>%
gt() %>%
tab_header(
title = md("**Predicting AI_Score: Model Performance with Cross-Validation and Confidence Intervals. **")
) %>%
cols_label(
Model = "Model",
Test_RMSE = "Test RMSE",
CV_RMSE = "CV RMSE",
CV_RMSE_SD = "CV RMSE SD",
R_Squared = "R²",
Accuracy = "Directional Accuracy"
) %>%
fmt_number(columns = c(Test_RMSE, CV_RMSE, CV_RMSE_SD, R_Squared, Accuracy), decimals = 3) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = "Model")
) %>%
# Highlight best CV RMSE
tab_style(
style = list(cell_fill(color = "#d0f0c0"), cell_text(weight = "bold")),
locations = cells_body(rows = CV_RMSE == best_cv_rmse)
) %>%
# Highlight best R²
tab_style(
style = list(cell_fill(color = "#cce5ff"), cell_text(weight = "bold")),
locations = cells_body(rows = R_Squared == best_r2)
) %>%
tab_options(
table.font.size = 12,
heading.title.font.size = 14,
heading.title.font.weight = "bold",
table.width = pct(90)
)
| **Predicting AI_Score: Model Performance with Cross-Validation and Confidence Intervals. ** | |||||
| Model | Test RMSE | CV RMSE | CV RMSE SD | R² | Directional Accuracy |
|---|---|---|---|---|---|
| Partial Least Squares Regression (PLSR) | 0.315 | 0.319 | 0.194 | 0.869 | 0.875 |
| Random Forest | 0.295 | 0.416 | 0.015 | 0.896 | 0.875 |
| Neural Network (MLP) | 0.226 | 0.357 | 0.165 | 0.927 | 0.875 |
Among the models evaluated, the model with the lowest RMSE offers the best generalization performance for predicting AI competitiveness. The Random Forest and PLSR models perform almost comparably, but each has trade-offs in interpretability and flexibility. Further work may incorporate additional covariates, longitudinal data, or ensemble approaches.
# Load necessary libraries
library(dplyr)
library(gt)
library(randomForest)
library(pls)
library(nnet)
library(caret)
# Extract the correct PLSR predictions
plsr_pred_vec <- plsr_pred[, 1, 1] # prediction for 2 components
# ---- PLSR metrics ----
plsr_r2 <- cor(plsr_pred_vec, test_data$AI_Score)^2
plsr_rmse <- sqrt(mean((plsr_pred_vec - test_data$AI_Score)^2))
plsr_accuracy <- mean(sign(plsr_pred_vec) == sign(test_data$AI_Score)) # directional accuracy
# ---- Random Forest metrics ----
rf_r2 <- cor(rf_pred, test_data$AI_Score)^2
rf_rmse <- sqrt(mean((rf_pred - test_data$AI_Score)^2))
rf_accuracy <- mean(sign(rf_pred) == sign(test_data$AI_Score))
# ---- Neural Network metrics ----
nn_r2 <- cor(nn_pred, test_data$AI_Score)^2
nn_rmse <- sqrt(mean((nn_pred - test_data$AI_Score)^2))
nn_accuracy <- mean(sign(nn_pred) == sign(test_data$AI_Score))
# ---- Combine all metrics into a data frame ----
model_performance <- data.frame(
Model = c("Partial Least Squares Regression (PLSR)",
"Random Forest",
"Neural Network (MLP)"),
RMSE = c(plsr_rmse, rf_rmse, nn_rmse),
R_Squared = c(plsr_r2, rf_r2, nn_r2),
Accuracy = c(plsr_accuracy, rf_accuracy, nn_accuracy)
)
# Round values for better presentation
model_performance <- model_performance %>%
mutate(across(c(RMSE, R_Squared, Accuracy), ~ round(., 3)))
# ---- Create professional table using gt ----
model_performance %>%
gt() %>%
tab_header(
title = md("**Predicting AI_Score: Model Performance Metrics **")
) %>%
cols_label(
Model = "Model",
RMSE = "RMSE",
R_Squared = "R²",
Accuracy = "Directional Accuracy"
) %>%
fmt_number(columns = c(RMSE, R_Squared, Accuracy), decimals = 3) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = "Model")
) %>%
tab_options(
table.font.size = 12,
heading.title.font.size = 14,
heading.title.font.weight = "bold",
table.width = pct(90)
)
| **Predicting AI_Score: Model Performance Metrics ** | |||
| Model | RMSE | R² | Directional Accuracy |
|---|---|---|---|
| Partial Least Squares Regression (PLSR) | 0.315 | 0.869 | 0.875 |
| Random Forest | 0.295 | 0.896 | 0.875 |
| Neural Network (MLP) | 0.226 | 0.927 | 0.875 |