
# Load libraries
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift
library(ggplot2)
library(pls)

Attaching package: 'pls'

The following object is masked from 'package:caret':

    R2

The following object is masked from 'package:stats':

    loadings
library(randomForest)
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:dplyr':

    combine

The following object is masked from 'package:ggplot2':

    margin
library(nnet)
library(GGally)
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
library(scales)

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor
library(ggResidpanel)


# Load the datasets
ai_data <- read.csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/AI_index_db.csv")

group_data <- read.csv("https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/gdp_gini_data.csv", header = FALSE)
colnames(group_data) <- c("Raw")
group_data <- group_data %>%
  separate(Raw, into = c("Country", "GDP_Per_Capita", "Gini", "GDP"), sep = ",")

# Merge datasets by Country
ai_merged <- left_join(ai_data, group_data, by = "Country")

# Convert GDP_Per_Capita to numeric (cleaning if needed)
ai_merged <- ai_merged %>%
  mutate(GDP_Per_Capita = parse_number(GDP_Per_Capita))
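
A quick, optional sanity check after the join and numeric conversion (any country that failed to match or parse shows up as NA here):

# Optional check: how many GDP figures are missing after the join/parse
sum(is.na(ai_merged$GDP_Per_Capita))
summary(ai_merged$GDP_Per_Capita)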

# Create RnD as combination of Talent and Research
ai_merged <- ai_merged %>%
  mutate(RnD = Talent + Research)
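
Because RnD is simply the sum of two pillars, an optional check of how strongly they correlate helps interpret the combined score:

# Optional: correlation between the two pillars combined into RnD
cor(ai_merged$Talent, ai_merged$Research, use = "complete.obs")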


# EDA - Boxplots by income and geopolitical group
ggplot(ai_merged, aes(x = `Income.group`, y = Total.score, fill = `Income.group`)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "AI Score by Income Group") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(ai_merged, aes(x = `Political.regime`, y = Total.score, fill = `Political.regime`)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "AI Score by Political.regime") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(ai_merged, aes(x = `Region`, y = Total.score, fill = `Region`)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "AI Score by Region") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(ai_merged, aes(x = `Cluster`, y = Total.score, fill = `Cluster`)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "AI Score by Cluster") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(ai_merged, aes(x = `Government.Strategy`, y = Total.score, fill = `Region`)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "AI Score by Government.Strategy") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(ai_merged, aes(x = `Operating.Environment`, y = Total.score, fill = `Region`)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "AI Score by Operating.Environment") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
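
GGally is loaded above but not otherwise used; an optional pairwise overview of the main numeric pillars (the same columns used in the models below) can complement the boxplots:

# Optional: pairwise scatterplots and correlations of the numeric pillars
ggpairs(ai_merged %>% select(Talent, Research, Infrastructure, Commercial, Development, Total.score))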

# Standardize predictors
ai_scaled <- ai_merged %>% 
  mutate(across(c(Talent, Research, Government.Strategy, Infrastructure, Commercial, Development, GDP_Per_Capita, RnD), as.numeric)) %>%
  mutate(across(c(Talent, Research, Government.Strategy, Infrastructure, Commercial, Development, GDP_Per_Capita, RnD), scale))
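
Note that scale() stores each standardized column as a one-column matrix; the models below tolerate this, but an optional cleanup converts them back to plain numeric vectors:

# Optional: drop the matrix structure that scale() attaches to each column
ai_scaled <- ai_scaled %>%
  mutate(across(where(is.matrix), as.numeric))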

# Linear model: Total.score ~ Talent
lm1 <- lm(Total.score ~ Talent, data = ai_scaled)
summary(lm1)

Call:
lm(formula = Total.score ~ Talent, data = ai_scaled)

Residuals:
    Min      1Q  Median      3Q     Max 
-17.945  -3.306  -1.246   2.845  39.256 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  23.9147     0.9818   24.36   <2e-16 ***
Talent       13.0361     0.9898   13.17   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.731 on 60 degrees of freedom
Multiple R-squared:  0.743, Adjusted R-squared:  0.7387 
F-statistic: 173.5 on 1 and 60 DF,  p-value: < 2.2e-16
resid_panel(lm1)
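
An optional scatterplot of the raw scores gives a visual companion to lm1:

# Optional: raw relationship between Talent and the total score, with a fitted line
ggplot(ai_merged, aes(x = Talent, y = Total.score)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  theme_minimal() +
  labs(title = "Total AI Score vs Talent")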

# Multiple linear regression: Total.score ~ Talent + Research
lm2 <- lm(Total.score ~ Talent + Research, data = ai_scaled)
summary(lm2)

Call:
lm(formula = Total.score ~ Talent + Research, data = ai_scaled)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.4453  -2.7429   0.5836   3.1185   9.0231 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  23.9147     0.5480  43.642  < 2e-16 ***
Talent        4.2078     0.9426   4.464 3.69e-05 ***
Research     10.8956     0.9426  11.559  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.315 on 59 degrees of freedom
Multiple R-squared:  0.9213,    Adjusted R-squared:  0.9186 
F-statistic: 345.2 on 2 and 59 DF,  p-value: < 2.2e-16
resid_panel(lm2)
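
Since lm1 is nested in lm2, an optional partial F-test makes the improvement from adding Research explicit:

# Optional: compare the nested models
anova(lm1, lm2)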

# Prepare data for machine learning
set.seed(131017)

# Remove rows with missing values from the entire dataset
ai_scaled <- na.omit(ai_scaled)

# Split the data into training and testing sets
splitIndex <- createDataPartition(ai_scaled$Total.score, p = 0.8, list = FALSE)
train_data <- ai_scaled[splitIndex, ]
test_data <- ai_scaled[-splitIndex, ]

# Ensure no missing values in train and test data
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)

# Define the formula for modeling
ml_formula <- as.formula("Total.score ~ RnD + Government.Strategy + Infrastructure + Commercial + Development + GDP_Per_Capita")

# Partial Least Squares Regression
pls_model <- caret::train(ml_formula, data = train_data, method = "pls", tuneLength = 10, preProcess = c("center", "scale"))
pls_pred <- predict(pls_model, test_data)
pls_perf <- postResample(pls_pred, test_data$Total.score)
print(pls_perf)
    RMSE Rsquared      MAE 
1.656422 0.990346 1.281423 
print(pls_model$bestTune)
  ncomp
5     5
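
An optional diagnostic is caret's plot method for train objects, which shows how resampled RMSE changes with the number of PLS components:

# Optional: RMSE profile over the candidate numbers of components
plot(pls_model)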

# Random Forest Model
rf_model <- caret::train(ml_formula, data = train_data, method = "rf", tuneLength = 5, preProcess = c("center", "scale"))

# Predict using the Random Forest model
rf_pred <- predict(rf_model, test_data)

# Evaluate model performance
rf_perf <- postResample(rf_pred, test_data$Total.score)

# Print the performance metrics
print(rf_perf)
     RMSE  Rsquared       MAE 
8.6975707 0.8805077 5.5964486 
# Best tuning parameters for Random Forest
print(rf_model$bestTune)
  mtry
2    3
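
Optionally, the underlying randomForest fit can be printed to see its out-of-bag error and forest size:

# Optional: out-of-bag performance of the final random forest
print(rf_model$finalModel)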

# Neural Network Model
nn_model <- caret::train(ml_formula, data = train_data, method = "nnet", linout = TRUE, trace = FALSE, tuneLength = 5, preProcess = c("center", "scale"))

# Predict using the Neural Network model
nn_pred <- predict(nn_model, test_data)

# Evaluate model performance
nn_perf <- postResample(nn_pred, test_data$Total.score)

# Print the performance metrics
print(nn_perf)
     RMSE  Rsquared       MAE 
5.8617679 0.9360665 3.1039732 
# Best tuning parameters for the Neural Network
print(nn_model$bestTune)
  size decay
4    1  0.01
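
Optionally, the full tuning results can be reviewed to see how performance varied across the size/decay grid:

# Optional: resampled RMSE and R-squared for every size/decay combination tried
nn_model$results
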
# Compare model performances
results <- data.frame(
  Model = c("PLSR", "Random Forest", "Neural Network"),
  RMSE = c(pls_perf["RMSE"], rf_perf["RMSE"], nn_perf["RMSE"]),
  Rsquared = c(pls_perf["Rsquared"], rf_perf["Rsquared"], nn_perf["Rsquared"])
)
print(results)
           Model     RMSE  Rsquared
1           PLSR 1.656422 0.9903460
2  Random Forest 8.697571 0.8805077
3 Neural Network 5.861768 0.9360665
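
This comparison relies on a single, small hold-out split. As an optional cross-check, caret::resamples() compares the models on their (default bootstrap) resampling results; ideally the three models would share a trainControl with fixed seeds so the resamples are paired:

# Optional: compare models on their resampling distributions rather than one test split
resamps <- resamples(list(PLSR = pls_model, RF = rf_model, NN = nn_model))
summary(resamps)
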
# Visualize predicted vs actual values
pred_df <- data.frame(
  Actual = test_data$Total.score,
  PLSR = pls_pred,
  RF = rf_pred,
  NN = nn_pred
) %>%
  pivot_longer(-Actual, names_to = "Model", values_to = "Predicted")

ggplot(pred_df, aes(x = Actual, y = Predicted, color = Model)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  theme_minimal() +
  labs(title = "Predicted vs Actual AI Scores", x = "Actual", y = "Predicted")

# Get variable importance from the Random Forest model trained above
rf_importance <- varImp(rf_model, scale = FALSE)

# Plot the variable importance
imp_df <- rf_importance$importance %>%
  tibble::rownames_to_column("Predictor")

ggplot(imp_df, aes(x = reorder(Predictor, Overall), y = Overall)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Variable Importance from Random Forest", x = "Predictors", y = "Importance") +
  theme_minimal()

Code used to produce the regression summary tables:

library(dplyr)
library(gt)

# Create data frame with regression output
reg_results <- tibble::tibble(
  Term = c("(Intercept)", "Talent"),
  Estimate = c(23.9147, 13.0361),
  `Std. Error` = c(0.9818, 0.9898),
  `t value` = c(24.36, 13.17),
  `Pr(>|t|)` = c("< 2e-16 ***", "< 2e-16 ***")
)

# Create residual and model fit summary
residual_summary <- "Residuals: Min = -17.945, 1Q = -3.306, Median = -1.246, 3Q = 2.845, Max = 39.256"
fit_summary <- "Residual SE = 7.731 on 60 DF; R² = 0.743; Adj. R² = 0.7387; F(1, 60) = 173.5; p < 2.2e-16"

# Generate gt table
reg_results %>%
  gt() %>%
  tab_header(
    title = "Regression Results: Total AI Score ~ Talent"
  ) %>%
  fmt_number(columns = c(Estimate, `Std. Error`, `t value`), decimals = 3) %>%
  cols_label(
    Term = "Predictor",
    Estimate = "Estimate",
    `Std. Error` = "Std. Error",
    `t value` = "t value",
    `Pr(>|t|)` = "Pr(>|t|)"
  ) %>%
  tab_footnote(
    footnote = residual_summary,
    locations = cells_title(groups = "title")
  ) %>%
  tab_source_note(
    source_note = fit_summary
  )
Regression Results: Total AI Score ~ Talent

Predictor     Estimate   Std. Error   t value   Pr(>|t|)
(Intercept)     23.915        0.982    24.360   < 2e-16 ***
Talent          13.036        0.990    13.170   < 2e-16 ***

Residuals: Min = -17.945, 1Q = -3.306, Median = -1.246, 3Q = 2.845, Max = 39.256
Residual SE = 7.731 on 60 DF; R² = 0.743; Adj. R² = 0.7387; F(1, 60) = 173.5; p < 2.2e-16
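
An alternative to hard-coding the estimates (assuming the broom package is available, which is not loaded above) is to build the coefficient table directly from the fitted model:

# Optional: derive the table from lm1 itself rather than retyping the values
library(broom)
tidy(lm1) %>%
  gt() %>%
  tab_header(title = "Regression Results: Total AI Score ~ Talent") %>%
  fmt_number(columns = c(estimate, std.error, statistic), decimals = 3)
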
library(dplyr)
library(gt)

# Create data frame with regression output
reg_results_mlr <- tibble::tibble(
  Term = c("(Intercept)", "Talent", "Research"),
  Estimate = c(23.9147, 4.2078, 10.8956),
  `Std. Error` = c(0.5480, 0.9426, 0.9426),
  `t value` = c(43.642, 4.464, 11.559),
  `Pr(>|t|)` = c("< 2e-16 ***", "3.69e-05 ***", "< 2e-16 ***")
)

# Create residual and model fit summary
residual_summary_mlr <- "Residuals: Min = -12.4453, 1Q = -2.7429, Median = 0.5836, 3Q = 3.1185, Max = 9.0231"
fit_summary_mlr <- "Residual SE = 4.315 on 59 DF; R² = 0.9213; Adj. R² = 0.9186; F(2, 59) = 345.2; p < 2.2e-16"

# Generate gt table
reg_results_mlr %>%
  gt() %>%
  tab_header(
    title = "Regression Results: Total AI Score ~ Talent + Research"
  ) %>%
  fmt_number(columns = c(Estimate, `Std. Error`, `t value`), decimals = 3) %>%
  cols_label(
    Term = "Predictor",
    Estimate = "Estimate",
    `Std. Error` = "Std. Error",
    `t value` = "t value",
    `Pr(>|t|)` = "Pr(>|t|)"
  ) %>%
  tab_footnote(
    footnote = residual_summary_mlr,
    locations = cells_title(groups = "title")
  ) %>%
  tab_source_note(
    source_note = fit_summary_mlr
  )
Regression Results: Total AI Score ~ Talent + Research

Predictor     Estimate   Std. Error   t value   Pr(>|t|)
(Intercept)     23.915        0.548    43.642   < 2e-16 ***
Talent           4.208        0.943     4.464   3.69e-05 ***
Research        10.896        0.943    11.559   < 2e-16 ***

Residuals: Min = -12.4453, 1Q = -2.7429, Median = 0.5836, 3Q = 3.1185, Max = 9.0231
Residual SE = 4.315 on 59 DF; R² = 0.9213; Adj. R² = 0.9186; F(2, 59) = 345.2; p < 2.2e-16