Packages used in R

ipak <- function(pkg){
  # install any listed packages that are not yet present, then load them all
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}


packages <- c("readxl","widyr","dplyr","stringr","tm","tidytext","ggplot2",
              "topicmodels","tidyr","textdata","writexl","textclean","igraph",
              "ggraph","tibble","widyr","DescTools","purrr","flextable","officer",
              "ggraph", "psych", "rpart", "rpart.plot", "quanteda")

ipak(packages)  

Prepare data

setwd("C:/Users/Usuario/OneDrive - Universitat de Barcelona/0_RECERCA/Congresos/2025/ECCES/Final")
data <- read_excel("unico.xlsx")
data_cleaned <- data
data_cleaned <- data_cleaned %>%
  rename(`EU Grant` = `EU Grant award in euros (This amount represents the grant awarded after the selection stage and is indicative. Please note that any changes made during or after the project's lifetime will not be reflected here.)`)
data_cleaned <- data_cleaned %>%
  mutate(
    `Key Action` = as.factor(`Key Action`),
    `Action Type` = as.factor(`Action Type`),
    `Funding Year` = as.factor(`Funding Year`),
    `Coordinator's country` = as.factor(`Coordinator's country`),
    `Is Good Practice` = as.factor(`Is Good Practice`),
    Ongoing = as.factor(Ongoing),
    `Topic_Model` = as.factor(`Topic_Model`),  # added for this version
    `EU Grant` = as.numeric(`EU Grant`)
  )
summary(data_cleaned)
##  Key Action                                                Action Type  
##  CIEGP:1546   European policy experimentation in higher education:  10  
##  PCEP :1611   HigherEd                                           :1461  
##  SPR  :  47   Policy_Exp                                         :  37  
##               School                                             :1696  
##                                                                         
##                                                                         
##                                                                         
##   Funding Year Project Identifier Project Title      Project Summary   
##  2023   :722   Length:3204        Length:3204        Length:3204       
##  2024   :593   Class :character   Class :character   Class :character  
##  2022   :302   Mode  :character   Mode  :character   Mode  :character  
##  2014   :298                                                           
##  2020   :295                                                           
##  2019   :237                                                           
##  (Other):757                                                           
##     Topics          Results Platform Project Card
##  Length:3204        Length:3204                  
##  Class :character   Class :character             
##  Mode  :character   Mode  :character             
##                                                  
##                                                  
##                                                  
##                                                  
##  Coordinating organisation name Coordinator's country Participating Countries
##  Length:3204                    ES     : 282          Length:3204            
##  Class :character               IT     : 274          Class :character       
##  Mode  :character               PL     : 258          Mode  :character       
##                                 FR     : 256                                 
##                                 DE     : 242                                 
##                                 NL     : 141                                 
##                                 (Other):1751                                 
##     EU Grant       Is Good Practice Ongoing   Topic_Model   Ethics Category   
##  Min.   :      0   No :1658         N:1593   Topic 1: 623   Length:3204       
##  1st Qu.: 222953   Yes:1546         Y:1611   Topic 2: 574   Class :character  
##  Median : 250000                             Topic 3:1022   Mode  :character  
##  Mean   : 291016                             Topic 4: 582                     
##  3rd Qu.: 400000                             Topic 5: 403                     
##  Max.   :2500000                                                              
##                                                                               
##  AI related project
##  Length:3204       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
DT::datatable(data_cleaned)

Character or factor variables

cat_vars <- data_cleaned %>%
  select(where(~ is.character(.) | is.factor(.)))
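
As a quick check (a sketch not in the original output; it assumes cat_vars built as above), the most frequent values of each character or factor column can be listed:

cat_vars %>%
  map(~ head(sort(table(.x), decreasing = TRUE), 5))   # five most common values per categorical column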

Topic Modelling on whole corpus

Clean project summaries

data_cleaned <- data_cleaned %>%
  mutate(summary_only_clean = `Project Summary` %>%
           replace_non_ascii() %>%
           str_to_lower() %>%
           replace_contraction() %>%
           replace_symbol() %>%
           replace_number() %>%
           str_replace_all("[[:punct:]]", " ") %>%
           str_squish())
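
To see what the cleaning pipeline does, it can be applied to a single made-up summary (a hypothetical string, not taken from the data):

example <- "The project won't train 1,200 teachers & students in 3 countries!"  # illustration only
example %>%
  replace_non_ascii() %>%
  str_to_lower() %>%
  replace_contraction() %>%
  replace_symbol() %>%
  replace_number() %>%
  str_replace_all("[[:punct:]]", " ") %>%
  str_squish()
# contractions are expanded, symbols and numbers written out as words, punctuation dropped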

Create tokens from text

custom_stopwords <- c(
  stopwords("en"),
  "project", "students", "education", "europe", "school", "higher", "one", "thousand", 
  "educational", "three", "improve", "eu", "based", "needs", "challenges",
  "will", "use", "key", "need", "objectives", "countries", "four", "goal", "e", 
  "objective", "aims", "development", "s", "two", "aim", "develop", 
  "learning", "teachers", "main", "among"
)

tokens_clean <- tokens(data_cleaned$summary_only_clean,
                       what = "word",
                       remove_punct = TRUE,
                       remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = custom_stopwords)

Create dfm (document-feature matrix)

dfm_clean <- dfm(tokens_clean)

Trim sparse terms (similar in spirit to Python's min_df = 10, max_df = 0.9; note that min_termfreq filters on total term frequency, whereas sklearn's min_df counts documents)

dfm_clean <- dfm_trim(dfm_clean, min_termfreq = 10, max_docfreq = 0.9, docfreq_type = "prop")
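
A quick look at how many documents and features survive the trimming (a sketch; its output is not shown here):

dim(dfm_clean)   # rows = documents, columns = retained features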

Convert to topicmodels format

dtm_topicmodels <- convert(dfm_clean, to = "topicmodels")

Fit LDA model

lda_model <- LDA(dtm_topicmodels, k = 5, control = list(seed = 42))

Extract top 10 keywords per topic

terms_matrix <- terms(lda_model, 10)
print(terms_matrix)
##       Topic 1         Topic 2     Topic 3       Topic 4      Topic 5     
##  [1,] "digital"       "european"  "skills"      "schools"    "european"  
##  [2,] "european"      "training"  "european"    "skills"     "teaching"  
##  [3,] "social"        "create"    "digital"     "digital"    "innovative"
##  [4,] "future"        "awareness" "new"         "new"        "skills"    
##  [5,] "skills"        "schools"   "training"    "innovative" "climate"   
##  [6,] "children"      "promote"   "social"      "approach"   "knowledge" 
##  [7,] "environmental" "skills"    "competences" "knowledge"  "practices" 
##  [8,] "increase"      "social"    "teaching"    "children"   "inclusive" 
##  [9,] "world"         "teaching"  "young"       "pupils"     "specific"  
## [10,] "ai"            "primary"   "innovative"  "design"     "science"

Plot (dummy counts used here; real counts could be taken from the table(dominant_topic) tally above)

topic_counts <- data.frame(
  Topic = factor(1:5),
  Count = c(620, 570, 1020, 580, 400),
  Keywords = c(
    "digital\neuropean\nsocial\nfuture\nskills\nchildren\nenvironmental\nincrease\nworld\nAI",
    "european\ntraining\ncreate\nawareness\nschools\npromote\nskills\nsocial\nteaching\nprimary",
    "skills\neuropean\ndigital\nnew\ntraining\nsocial\ncompetences\nteaching\nyoung\ninnovative",
    "schools\nskills\ndigital\nnew\ninnovative\napproach\nknowledge\nchildren\npupils\ndesign",
    "european\nteaching\ninnovative\nskills\nclimate\nknowledge\npractices\ninclusive\nspecific\nscience"
  )
)

Optional: topic labels

topic_labels <- tibble(
  topic = factor(1:5),
  label = c(
    "European Digital Future",
    "European awareness",
    "Youth Skills and Social Change",
    "Early Education Digital Innovation",
    "Climate Change, Inclusion & STEAM")
)
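
A sketch of how these optional labels could be attached to the plot data (it assumes Topic and topic encode the same 1-5 numbering):

topic_counts <- topic_counts %>%
  left_join(topic_labels, by = c("Topic" = "topic"))   # adds a label column for axis or facet text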

Plot with keywords

custom_colors <- c("#e69f00", "#d55e00", "#cc79a7", "#56b4e9", "#009e73")

ggplot(topic_counts, aes(x = Keywords, y = Count, fill = Topic)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = Count), 
            vjust = 2.5, 
            size = 4) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Distribution of Dominant Topics across Projects",
    x = "Topics (Top Keywords)",
    y = "Number of Projects"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 10, angle = 0, hjust = 0.5, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(face = "bold", size = 16)
  )

AI & Ethics crosstabs [Country, Year]

Projects by country/year and ethics category. Create a shortened year label.

by_group <- data %>%
  group_by(`Coordinator's country`, `Funding Year`, `Ethics Category`) %>%
  summarise(Projects = n(), .groups = "drop") %>%
  mutate(FundingYearShort = substr(as.character(`Funding Year`), 3, 4)) 

Plot using short year labels

ggplot(by_group, aes(x = FundingYearShort, y = Projects, fill = `Ethics Category`)) +
  geom_col(position = "dodge") +
  facet_wrap(~`Coordinator's country`) +
  labs(
    title = "Projects by Country, Year, and Ethics Category",
    x = "Funding Year (last 2 digits)",
    y = "Number of Projects") +
  theme_minimal(base_size = 10)+
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 10, hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

ggplot(by_group, aes(x = `Ethics Category`, y = Projects, fill = `Ethics Category`)) +
  geom_col(position = "dodge") +
  facet_wrap(~`Coordinator's country`) +
  labs(
    title = "Projects by Country and Ethics Category",
    x = "",
    y = "Number of Projects") +
  theme_minimal(base_size = 10)+
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 10, hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Count by country, year, AI and Ethics category

cross_tab <- data_cleaned %>%
  count(`Coordinator's country`, `Funding Year`, `AI Category`, `Ethics Category`) %>%
  rename(Count = n)

Shorten year (last 2 digits)

cross_tab <- cross_tab %>%
  mutate(FundingYearShort = substr(as.character(`Funding Year`), 3, 4))

Plot

ggplot(cross_tab, aes(x = FundingYearShort, y = Count, fill = `Ethics Category`)) +
  geom_col(position = "dodge", width = 0.8) +
  facet_wrap(~ `Coordinator's country`, scales = "free_y", ncol = 6) +
  labs(
    title = "Ethics Classification of AI Projects by Country and Year",
    x = "Funding Year (last 2 digits)",
    y = "Project Count",
    fill = "Ethics Category"
  ) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
    strip.text = element_text(size = 8),
    plot.title = element_text(face = "bold", size = 14)
  )

Some inferential analyses

Count projects by Ethics and AI category

combine_df <- data_cleaned %>%
  count(`Ethics Category`, `AI Category`)

Plot

ggplot(combine_df, aes(x = `Ethics Category`, y = n, fill = `AI Category`)) +
  geom_col(position = "dodge") + 
  geom_text(aes(label = n),
            position = position_dodge(width = 0.9),
            vjust = -0.5,
            size = 4) +
  labs(
    title = "Projects by Ethics & AI Categories",
    x = "Ethics Category",
    y = "Number of Projects",
    fill = "AI Category"
  ) +
  theme_minimal(base_size = 14)+
  theme(
    axis.text.x = element_text(size = 10, angle = 0, hjust = 0.5, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(face = "bold", size = 16)
  )

Clean and simplify variable names

data_prop <- data_cleaned %>%
  rename(
    Ethics = `Ethics Category`,
    AI = `AI Category`
  ) %>%
  filter(!is.na(Ethics), !is.na(AI))
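
With the simplified names in place, the share of ethics & equity-related projects within each AI category can be tabulated (a sketch, not part of the original output):

data_prop %>%
  count(AI, Ethics) %>%
  group_by(AI) %>%
  mutate(Share = n / sum(n)) %>%   # proportion of each ethics category within an AI category
  ungroup()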

Logistic regression to test whether being an AI-related project (and the funding year) predicts an ethics & equity focus

table(data_cleaned$`Ethics Category`, data_cleaned$`AI Category`)
##                              
##                               AI Related Non AI Related
##   Ethics & Equity Related            122            131
##   Non Ethics & Equity Related       1396           1555
tab <- table(data_cleaned$`Ethics Category`, data_cleaned$`AI Category`)
phi(tab)
## [1] 0
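
The printed 0 is a rounding effect: phi() here comes from the psych package, which rounds to two digits by default, so a very weak association displays as 0. A sketch to view it with more precision:

phi(tab, digits = 4)   # same phi coefficient, shown to four decimal places
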
# Note: `Funding Year` was converted to a factor above, so as.numeric() returns the
# level index rather than the calendar year; as.numeric(as.character(...)) would
# recover the actual years if that scale is preferred.
data_cleaned$`Funding Year` <- as.numeric(data_cleaned$`Funding Year`)


data_cleaned$EthicsCategoryBinary <- 
  ifelse(data_cleaned$`Ethics Category` == "Ethics & Equity Related", 1, 0)

model_2 <- glm(EthicsCategoryBinary ~ `Funding Year` + `AI Category`, data = data_cleaned, family = binomial)
summary(model_2)
## 
## Call:
## glm(formula = EthicsCategoryBinary ~ `Funding Year` + `AI Category`, 
##     family = binomial, data = data_cleaned)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -3.30360    0.21855 -15.116  < 2e-16 ***
## `Funding Year`               0.10114    0.02204   4.588 4.47e-06 ***
## `AI Category`Non AI Related  0.14460    0.13683   1.057    0.291    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1770.1  on 3203  degrees of freedom
## Residual deviance: 1747.2  on 3201  degrees of freedom
## AIC: 1753.2
## 
## Number of Fisher Scoring iterations: 5
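
Exponentiating the coefficients gives odds ratios, which are easier to read than log-odds (a sketch; confint() profiles the likelihood, so it may take a moment):

exp(cbind(OR = coef(model_2), confint(model_2)))   # odds ratios with 95% profile CIs
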
model_3 <- glm(EthicsCategoryBinary ~ `Funding Year` + `AI Category` + `EU Grant` , data = data_cleaned, family = binomial)
summary(model_3)
## 
## Call:
## glm(formula = EthicsCategoryBinary ~ `Funding Year` + `AI Category` + 
##     `EU Grant`, family = binomial, data = data_cleaned)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -3.254e+00  2.419e-01 -13.454  < 2e-16 ***
## `Funding Year`               1.026e-01  2.230e-02   4.603 4.17e-06 ***
## `AI Category`Non AI Related  1.438e-01  1.369e-01   1.050    0.294    
## `EU Grant`                  -2.076e-07  4.462e-07  -0.465    0.642    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1770.1  on 3203  degrees of freedom
## Residual deviance: 1747.0  on 3200  degrees of freedom
## AIC: 1755
## 
## Number of Fisher Scoring iterations: 5
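
Since model_2 is nested in model_3, a likelihood-ratio test (alongside the AIC values already reported) can indicate whether adding `EU Grant` improves fit (a sketch, not part of the original output):

anova(model_2, model_3, test = "Chisq")   # likelihood-ratio test for the added `EU Grant` term
AIC(model_2, model_3)                     # lower AIC indicates the preferred model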