── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
# Plotting (ggplot2) and partial least squares regression (pls).
# One call per line: two calls on the same line without a separator do not parse.
library(ggplot2)
library(pls)
Attaching package: 'pls'
The following object is masked from 'package:caret':
R2
The following object is masked from 'package:stats':
loadings
library(randomForest)
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
Attaching package: 'randomForest'
The following object is masked from 'package:dplyr':
combine
The following object is masked from 'package:ggplot2':
margin
# nnet backs caret's "nnet" method used later; GGally extends ggplot2.
# One call per line: two calls on the same line without a separator do not parse.
library(nnet)
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
library(scales)
Attaching package: 'scales'
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
library(ggResidpanel)

# Load the datasets ----
ai_data <- read.csv(
  "https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/AI_index_db.csv"
)

# The GDP/Gini file is read without a header row; its column names are then
# replaced by `Raw`, and `Raw` is split on commas into the four named fields.
# NOTE(review): presumably each record arrives as a single comma-containing
# string in the first column -- confirm against the raw file.
group_data <- read.csv(
  "https://raw.githubusercontent.com/Heleinef/Data-Science-Master_Heleine/refs/heads/main/gdp_gini_data.csv",
  header = FALSE
)
colnames(group_data) <- c("Raw")
group_data <- group_data %>%
  separate(
    Raw,
    into = c("Country", "GDP_Per_Capita", "Gini", "GDP"),
    sep = ","
  )

# Merge datasets by Country (left join: every row of ai_data is kept)
ai_merged <- left_join(ai_data, group_data, by = "Country")

# Convert GDP_Per_Capita to numeric (cleaning if needed)
ai_merged <- ai_merged %>%
  mutate(GDP_Per_Capita = parse_number(GDP_Per_Capita))

# Create RnD as combination of Talent and Research
ai_merged <- ai_merged %>%
  mutate(RnD = Talent + Research)

# EDA - Boxplots by income and geopolitical group ----
ggplot(
  ai_merged,
  aes(x = Income.group, y = Total.score, fill = Income.group)
) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "AI Score by Income Group") +
  # theme() must follow theme_minimal(), which would otherwise reset it
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total AI score by political regime; boxes are filled by the same grouping.
ai_merged %>%
  ggplot(aes(x = Political.regime, y = Total.score, fill = Political.regime)) +
  labs(title = "AI Score by Political.regime") +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Total AI score by region.
ai_merged %>%
  ggplot(aes(x = Region, y = Total.score, fill = Region)) +
  labs(title = "AI Score by Region") +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Total AI score by cluster.
ai_merged %>%
  ggplot(aes(x = Cluster, y = Total.score, fill = Cluster)) +
  labs(title = "AI Score by Cluster") +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total AI score against Government.Strategy, with boxes filled by Region.
# NOTE(review): unlike the earlier boxplots, `fill` (Region) differs from `x`
# here while the title names only the x variable -- confirm this is intended.
ai_merged %>%
  ggplot(aes(x = Government.Strategy, y = Total.score, fill = Region)) +
  labs(title = "AI Score by Government.Strategy") +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Total AI score against Operating.Environment, with boxes filled by Region.
ai_merged %>%
  ggplot(aes(x = Operating.Environment, y = Total.score, fill = Region)) +
  labs(title = "AI Score by Operating.Environment") +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Call:
lm(formula = Total.score ~ Talent, data = ai_scaled)
Residuals:
Min 1Q Median 3Q Max
-17.945 -3.306 -1.246 2.845 39.256
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.9147 0.9818 24.36 <2e-16 ***
Talent 13.0361 0.9898 13.17 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.731 on 60 degrees of freedom
Multiple R-squared: 0.743, Adjusted R-squared: 0.7387
F-statistic: 173.5 on 1 and 60 DF, p-value: < 2.2e-16
# Residual diagnostic panel for the model stored in `lm1`
resid_panel(lm1)

# Multiple linear regression: Total.score ~ Talent + Research
lm2 <- lm(Total.score ~ Talent + Research, data = ai_scaled)
summary(lm2)
Call:
lm(formula = Total.score ~ Talent + Research, data = ai_scaled)
Residuals:
Min 1Q Median 3Q Max
-12.4453 -2.7429 0.5836 3.1185 9.0231
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.9147 0.5480 43.642 < 2e-16 ***
Talent 4.2078 0.9426 4.464 3.69e-05 ***
Research 10.8956 0.9426 11.559 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.315 on 59 degrees of freedom
Multiple R-squared: 0.9213, Adjusted R-squared: 0.9186
F-statistic: 345.2 on 2 and 59 DF, p-value: < 2.2e-16
# Residual diagnostic panel for the two-predictor model
resid_panel(lm2)

# Prepare data for machine learning ----
# The seed is set once, immediately before the only partition that is kept.
# (An earlier split made before na.omit() was always overwritten, so it has
# been dropped; the final train/test sets are unchanged.)
set.seed(131017)

# Remove rows with missing values from the entire dataset. Because this is
# done before splitting, the train/test subsets need no further na.omit().
ai_scaled <- na.omit(ai_scaled)

# Split the data into training (80%) and testing sets
splitIndex <- createDataPartition(ai_scaled$Total.score, p = 0.8, list = FALSE)
train_data <- ai_scaled[splitIndex, ]
test_data <- ai_scaled[-splitIndex, ]

# Define the formula for modeling
ml_formula <- as.formula(
  "Total.score ~ RnD + Government.Strategy + Infrastructure + Commercial + Development + GDP_Per_Capita"
)

# Partial Least Squares Regression ----
pls_model <- caret::train(
  ml_formula,
  data = train_data,
  method = "pls",
  tuneLength = 10,
  preProcess = c("center", "scale")
)

# Evaluate on the held-out test set (RMSE, R-squared, MAE)
pls_pred <- predict(pls_model, test_data)
pls_perf <- postResample(pls_pred, test_data$Total.score)
print(pls_perf)
RMSE Rsquared MAE
1.656422 0.990346 1.281423
# Tuning value(s) caret selected for the PLS model
print(pls_model[["bestTune"]])
ncomp
5 5
# Random Forest ----
# Ensure all variables used in the model are free of missing values
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)

# Make sure Total.score is numeric
train_data$Total.score <- as.numeric(train_data$Total.score)
test_data$Total.score <- as.numeric(test_data$Total.score)

# Define the formula for modeling
ml_formula <- as.formula(
  "Total.score ~ RnD + Government.Strategy + Infrastructure + Commercial + Development + GDP_Per_Capita"
)

# Random Forest Model
rf_model <- caret::train(
  ml_formula,
  data = train_data,
  method = "rf",
  tuneLength = 5,
  preProcess = c("center", "scale")
)

# Predict using the Random Forest model
rf_pred <- predict(rf_model, test_data)

# Evaluate model performance on the test set (RMSE, R-squared, MAE)
rf_perf <- postResample(rf_pred, test_data$Total.score)

# Print the performance metrics
print(rf_perf)
RMSE Rsquared MAE
8.6975707 0.8805077 5.5964486
# Best tuning parameters for Random Forest
print(rf_model$bestTune)
mtry
2 3
# Neural Network ----
# Ensure there are no missing values in the data
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)

# Make sure Total.score is numeric
train_data$Total.score <- as.numeric(train_data$Total.score)
test_data$Total.score <- as.numeric(test_data$Total.score)

# Define the formula for modeling
ml_formula <- as.formula(
  "Total.score ~ RnD + Government.Strategy + Infrastructure + Commercial + Development + GDP_Per_Capita"
)

# Neural Network Model
# linout = TRUE requests a linear output unit (regression rather than
# classification); trace = FALSE silences nnet's optimisation log.
nn_model <- caret::train(
  ml_formula,
  data = train_data,
  method = "nnet",
  linout = TRUE,
  trace = FALSE,
  tuneLength = 5,
  preProcess = c("center", "scale")
)

# Predict using the Neural Network model
nn_pred <- predict(nn_model, test_data)

# Evaluate model performance on the test set (RMSE, R-squared, MAE)
nn_perf <- postResample(nn_pred, test_data$Total.score)

# Print the performance metrics
print(nn_perf)
RMSE Rsquared MAE
5.8617679 0.9360665 3.1039732
# Best tuning parameters for the Neural Network
print(nn_model$bestTune)
Model RMSE Rsquared
1 PLSR 1.656422 0.9903460
2 Random Forest 8.697571 0.8805077
3 Neural Network 5.861768 0.9360665
# Visualize predicted vs actual values ----
# Reshape to one row per (test observation, model) so all three models can be
# drawn on a single set of axes, distinguished by colour.
pred_df <- data.frame(
  Actual = test_data$Total.score,
  PLSR = pls_pred,
  RF = rf_pred,
  NN = nn_pred
) %>%
  pivot_longer(-Actual, names_to = "Model", values_to = "Predicted")

ggplot(pred_df, aes(x = Actual, y = Predicted, color = Model)) +
  geom_point(alpha = 0.6) +
  # Dashed identity line: points lying on it are perfect predictions
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "Predicted vs Actual AI Scores",
    x = "Actual",
    y = "Predicted"
  )
# Variable importance from the Random Forest ----
# Reuse the `rf_model` that was fitted and evaluated earlier. Refitting here
# would produce a different forest, so the importances would not describe the
# model whose test metrics were reported.
rf_importance <- varImp(rf_model, scale = FALSE)

# varImp() returns a `varImp.train` object, not a data frame. Handing that
# object to ggplot() dispatches to caret's own plotting method, which ignores
# the supplied aes() and already flips the coordinates (the source of the
# "Coordinate system already present" message). Plot the underlying
# importance table instead so the mapping below is the one actually used.
rf_importance_df <- rf_importance$importance %>%
  tibble::rownames_to_column("Predictor")

# Plot the variable importance, most important predictor at the top
ggplot(
  rf_importance_df,
  aes(x = reorder(Predictor, Overall), y = Overall)
) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Variable Importance from Random Forest",
    x = "Predictors",
    y = "Importance"
  ) +
  theme_minimal()
Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Code for the various tables:
library(dplyr)
library(gt)

# Create data frame with regression output
# NOTE(review): these figures are typed in by hand and match the printed
# summary of the Total.score ~ Talent model; they will go stale if the data or
# model changes -- consider deriving them from the fitted model object.
reg_results <- tibble::tibble(
  Term = c("(Intercept)", "Talent"),
  Estimate = c(23.9147, 13.0361),
  `Std. Error` = c(0.9818, 0.9898),
  `t value` = c(24.36, 13.17),
  `Pr(>|t|)` = c("< 2e-16 ***", "< 2e-16 ***")
)

# Create residual and model fit summary
residual_summary <- "Residuals: Min = -17.945, 1Q = -3.306, Median = -1.246, 3Q = 2.845, Max = 39.256"
fit_summary <- "Residual SE = 7.731 on 60 DF; R² = 0.743; Adj. R² = 0.7387; F(1, 60) = 173.5; p < 2.2e-16"

# Generate gt table: residual quantiles go in a footnote attached to the
# title, and the overall fit statistics go in a source note under the table.
reg_results %>%
  gt() %>%
  tab_header(
    title = "Regression Results: Total AI Score ~ Talent"
  ) %>%
  fmt_number(
    columns = c(Estimate, `Std. Error`, `t value`),
    decimals = 3
  ) %>%
  cols_label(
    Term = "Predictor",
    Estimate = "Estimate",
    `Std. Error` = "Std. Error",
    `t value` = "t value",
    `Pr(>|t|)` = "Pr(>|t|)"
  ) %>%
  tab_footnote(
    footnote = residual_summary,
    locations = cells_title(groups = "title")
  ) %>%
  tab_source_note(
    source_note = fit_summary
  )
Regression Results: Total AI Score ~ Talent [1]

Predictor     Estimate   Std. Error   t value   Pr(>|t|)
(Intercept)     23.915        0.982    24.360   < 2e-16 ***
Talent          13.036        0.990    13.170   < 2e-16 ***

Residual SE = 7.731 on 60 DF; R² = 0.743; Adj. R² = 0.7387; F(1, 60) = 173.5; p < 2.2e-16
[1] Residuals: Min = -17.945, 1Q = -3.306, Median = -1.246, 3Q = 2.845, Max = 39.256
library(dplyr)
library(gt)

# Create data frame with regression output
# NOTE(review): these figures are typed in by hand and match the printed
# summary of the Total.score ~ Talent + Research model; they will go stale if
# the data or model changes -- consider deriving them from the fitted model.
reg_results_mlr <- tibble::tibble(
  Term = c("(Intercept)", "Talent", "Research"),
  Estimate = c(23.9147, 4.2078, 10.8956),
  `Std. Error` = c(0.5480, 0.9426, 0.9426),
  `t value` = c(43.642, 4.464, 11.559),
  `Pr(>|t|)` = c("< 2e-16 ***", "3.69e-05 ***", "< 2e-16 ***")
)

# Create residual and model fit summary
residual_summary_mlr <- "Residuals: Min = -12.4453, 1Q = -2.7429, Median = 0.5836, 3Q = 3.1185, Max = 9.0231"
fit_summary_mlr <- "Residual SE = 4.315 on 59 DF; R² = 0.9213; Adj. R² = 0.9186; F(2, 59) = 345.2; p < 2.2e-16"

# Generate gt table: residual quantiles go in a footnote attached to the
# title, and the overall fit statistics go in a source note under the table.
reg_results_mlr %>%
  gt() %>%
  tab_header(
    title = "Regression Results: Total AI Score ~ Talent + Research"
  ) %>%
  fmt_number(
    columns = c(Estimate, `Std. Error`, `t value`),
    decimals = 3
  ) %>%
  cols_label(
    Term = "Predictor",
    Estimate = "Estimate",
    `Std. Error` = "Std. Error",
    `t value` = "t value",
    `Pr(>|t|)` = "Pr(>|t|)"
  ) %>%
  tab_footnote(
    footnote = residual_summary_mlr,
    locations = cells_title(groups = "title")
  ) %>%
  tab_source_note(
    source_note = fit_summary_mlr
  )
Regression Results: Total AI Score ~ Talent + Research [1]

Predictor     Estimate   Std. Error   t value   Pr(>|t|)
(Intercept)     23.915        0.548    43.642   < 2e-16 ***
Talent           4.208        0.943     4.464   3.69e-05 ***
Research        10.896        0.943    11.559   < 2e-16 ***

Residual SE = 4.315 on 59 DF; R² = 0.9213; Adj. R² = 0.9186; F(2, 59) = 345.2; p < 2.2e-16
[1] Residuals: Min = -12.4453, 1Q = -2.7429, Median = 0.5836, 3Q = 3.1185, Max = 9.0231