# Install any missing packages, then load them all
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
packages <- c("readxl","widyr","dplyr","stringr","tm","tidytext","ggplot2",
"topicmodels","tidyr","textdata","writexl","textclean","igraph",
"ggraph","tibble","widyr","DescTools","purrr","flextable","officer",
"ggraph", "psych", "rpart", "rpart.plot", "quanteda")
ipak(packages)
setwd("C:/Users/Usuario/OneDrive - Universitat de Barcelona/0_RECERCA/Congresos/2025/ECCES/Final")
data <- read_excel("unico.xlsx")
data_cleaned <- data
data_cleaned <- data_cleaned %>%
rename(`EU Grant` = `EU Grant award in euros (This amount represents the grant awarded after the selection stage and is indicative. Please note that any changes made during or after the project's lifetime will not be reflected here.)`)
data_cleaned <- data_cleaned %>%
mutate(
`Key Action` = as.factor(`Key Action`),
`Action Type` = as.factor(`Action Type`),
`Funding Year` = as.factor(`Funding Year`),
`Coordinator's country` = as.factor(`Coordinator's country`),
`Is Good Practice` = as.factor(`Is Good Practice`),
Ongoing = as.factor(Ongoing),
`Topic_Model` = as.factor(`Topic_Model`), # added for this version
`EU Grant` = as.numeric(`EU Grant`)
)
summary(data_cleaned)
## Key Action Action Type
## CIEGP:1546 European policy experimentation in higher education: 10
## PCEP :1611 HigherEd :1461
## SPR : 47 Policy_Exp : 37
## School :1696
##
##
##
## Funding Year Project Identifier Project Title Project Summary
## 2023 :722 Length:3204 Length:3204 Length:3204
## 2024 :593 Class :character Class :character Class :character
## 2022 :302 Mode :character Mode :character Mode :character
## 2014 :298
## 2020 :295
## 2019 :237
## (Other):757
## Topics Results Platform Project Card
## Length:3204 Length:3204
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Coordinating organisation name Coordinator's country Participating Countries
## Length:3204 ES : 282 Length:3204
## Class :character IT : 274 Class :character
## Mode :character PL : 258 Mode :character
## FR : 256
## DE : 242
## NL : 141
## (Other):1751
## EU Grant Is Good Practice Ongoing Topic_Model Ethics Category
## Min. : 0 No :1658 N:1593 Topic 1: 623 Length:3204
## 1st Qu.: 222953 Yes:1546 Y:1611 Topic 2: 574 Class :character
## Median : 250000 Topic 3:1022 Mode :character
## Mean : 291016 Topic 4: 582
## 3rd Qu.: 400000 Topic 5: 403
## Max. :2500000
##
## AI related project
## Length:3204
## Class :character
## Mode :character
##
##
##
##
Character or factor variables
Clean project summaries
data_cleaned <- data_cleaned %>%
mutate(summary_only_clean = `Project Summary` %>%
replace_non_ascii() %>%
str_to_lower() %>%
replace_contraction() %>%
replace_symbol() %>%
replace_number() %>%
str_replace_all("[[:punct:]]", " ") %>%
str_squish())
Create tokens from text
custom_stopwords <- c(
stopwords("en"),
"project", "students", "education", "europe", "school", "higher", "one", "thousand",
"educational", "three", "improve", "eu", "based", "needs", "challenges",
"will", "use", "key", "need", "objectives", "countries", "four", "goal", "e",
"objective", "aims", "development", "s", "two", "aim", "develop",
"learning", "teachers", "main", "among"
)
tokens_clean <- tokens(data_cleaned$summary_only_clean,
what = "word",
remove_punct = TRUE,
remove_numbers = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = custom_stopwords)
Create dfm (document-feature matrix)
Trim sparse terms (similar to Python’s min_df = 10, max_df = 0.9)
Convert to topicmodels format
Fit LDA model
Extract top 10 keywords per topic
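The chunk that carries out these four steps is not echoed above. A minimal sketch, assuming k = 5 topics (matching the output below) and trimming thresholds analogous to min_df = 10 / max_df = 0.9; the object names and the random seed are illustrative:
dfm_clean <- dfm(tokens_clean)                                  # document-feature matrix
dfm_trimmed <- dfm_clean %>%
  dfm_trim(min_docfreq = 10, docfreq_type = "count") %>%        # drop rare terms
  dfm_trim(max_docfreq = 0.9, docfreq_type = "prop")            # drop near-ubiquitous terms
dfm_trimmed <- dfm_subset(dfm_trimmed, ntoken(dfm_trimmed) > 0) # guard: LDA errors on all-zero documents
dtm <- convert(dfm_trimmed, to = "topicmodels")                 # topicmodels format
lda_model <- LDA(dtm, k = 5, control = list(seed = 1234))       # fit LDA
terms(lda_model, 10)                                            # top 10 keywords per topic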
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "digital" "european" "skills" "schools" "european"
## [2,] "european" "training" "european" "skills" "teaching"
## [3,] "social" "create" "digital" "digital" "innovative"
## [4,] "future" "awareness" "new" "new" "skills"
## [5,] "skills" "schools" "training" "innovative" "climate"
## [6,] "children" "promote" "social" "approach" "knowledge"
## [7,] "environmental" "skills" "competences" "knowledge" "practices"
## [8,] "increase" "social" "teaching" "children" "inclusive"
## [9,] "world" "teaching" "young" "pupils" "specific"
## [10,] "ai" "primary" "innovative" "design" "science"
Plot (dummy counts used — replace with real ones if available)
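If the real counts are wanted, they can be read off the Topic_Model factor shown in the summary above instead of being hard-coded; a sketch (the object name is illustrative):
real_topic_counts <- data_cleaned %>%
  count(Topic_Model, name = "Count")   # 623, 574, 1022, 582, 403 per the summary above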
topic_counts <- data.frame(
Topic = factor(1:5),
Count = c(620, 570, 1020, 580, 400),
Keywords = c(
"digital\neuropean\nsocial\nfuture\nskills\nchildren\nenvironmental\nincrease\nworld\nAI",
"european\ntraining\ncreate\nawareness\nschools\npromote\nskills\nsocial\nteaching\nprimary",
"skills\neuropean\ndigital\nnew\ntraining\nsocial\ncompetences\nteaching\nyoung\ninnovative",
"schools\nskills\ndigital\nnew\ninnovative\napproach\nknowledge\nchildren\npupils\ndesign",
"european\nteaching\ninnovative\nskills\nclimate\nknowledge\npractices\ninclusive\nspecific\nscience"
)
)
Optional: topic labels
topic_labels <- tibble(
topic = factor(1:5),
label = c(
"European Digital Future",
"European awareness",
"Youth Skills and Social Change",
"Early Education Digital Innovation",
"Climate Change, Inclusion & STEAM")
)
Plot with keywords
custom_colors <- c("#e69f00", "#d55e00", "#cc79a7", "#56b4e9", "#009e73")
ggplot(topic_counts, aes(x = Keywords, y = Count, fill = Topic)) +
geom_bar(stat = "identity", show.legend = FALSE) +
geom_text(aes(label = Count),
vjust = 2.5,
size = 4) +
scale_fill_manual(values = custom_colors) +
labs(
title = "Distribution of Dominant Topics across Projects",
x = "Topics (Top Keywords)",
y = "Number of Projects"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(size = 10, angle = 0, hjust = 0.5, vjust = 0.5),
axis.text.y = element_text(size = 12),
plot.title = element_text(face = "bold", size = 16)
)
Projects by country/year and ethics
Create shortened year label
by_group <- data %>%
group_by(`Coordinator's country`, `Funding Year`, `Ethics Category`) %>%
summarise(Projects = n(), .groups = "drop") %>%
mutate(FundingYearShort = substr(as.character(`Funding Year`), 3, 4))
Plot using short year labels
ggplot(by_group, aes(x = FundingYearShort, y = Projects, fill = `Ethics Category`)) +
geom_col(position = "dodge") +
facet_wrap(~`Coordinator's country`) +
labs(
title = "Projects by Country, Year, and Ethics Category",
x = "Funding Year (last 2 digits)",
y = "Number of Projects") +
theme_minimal(base_size = 10)+
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold", size = 10, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1)
)
ggplot(by_group, aes(x = `Ethics Category`, y = Projects, fill = `Ethics Category`)) +
geom_col(position = "dodge") +
facet_wrap(~`Coordinator's country`) +
labs(
title = "Projects by Country and Ethics Category",
x = "",
y = "Number of Projects") +
theme_minimal(base_size = 10)+
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold", size = 10, hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1)
)
Count by country, year, AI and Ethics category
cross_tab <- data_cleaned %>%
count(`Coordinator's country`, `Funding Year`, `AI Category`, `Ethics Category`) %>%
rename(Count = n)
Shorten year (last 2 digits)
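The shortening step itself is not echoed; a sketch mirroring the by_group code above (note that the plot below keeps the full Funding Year on the x-axis, so this column is optional):
cross_tab <- cross_tab %>%
  mutate(FundingYearShort = substr(as.character(`Funding Year`), 3, 4))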
Plot
ggplot(cross_tab, aes(x = `Funding Year`, y = Count, fill = `Ethics Category`)) +
geom_col(position = "dodge", width = 0.8) +
facet_wrap(~ `Coordinator's country`, scales = "free_y", ncol = 6) +
labs(
title = "Ethics Classification of AI Projects by Country and Year",
x = "Funding Year",
y = "Project Count",
fill = "Ethics Category"
) +
theme_minimal(base_size = 10) +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
strip.text = element_text(size = 8),
plot.title = element_text(face = "bold", size = 14)
)
Plot
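combine_df is created in a chunk that is not shown. A plausible sketch, assuming it is simply the project count per Ethics/AI category pair (the object name is taken from the plot call below):
combine_df <- data_cleaned %>%
  count(`Ethics Category`, `AI Category`)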
ggplot(combine_df, aes(x = `Ethics Category`, y = n, fill = `AI Category`)) +
geom_col(position = "dodge") +
geom_text(aes(label = n),
position = position_dodge(width=0.9),
vjust = -0.5,
size = 4) +
labs(
title = "Projects by Ethics & AI Categories",
x = "Ethics Category",
y = "Number of Projects",
fill = "AI Category"
) +
theme_minimal(base_size = 14)+
theme(
axis.text.x = element_text(size = 10, angle = 0, hjust = 0.5, vjust = 0.5),
axis.text.y = element_text(size = 12),
plot.title = element_text(face = "bold", size = 16)
)
Clean and simplify variable names
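The cleaning chunk is not echoed, and the exact renaming it performs cannot be recovered from the output. What can be reproduced is the cross-tabulation and the zero printed below; a sketch, assuming the final check is simply a count of missing category values:
table(data_cleaned$`Ethics Category`, data_cleaned$`AI Category`)
sum(is.na(data_cleaned$`AI Category`))   # the `[1] 0` printed below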
##
## AI Related Non AI Related
## Ethics & Equity Related 122 131
## Non Ethics & Equity Related 1396 1555
## [1] 0
data_cleaned$`Funding Year` <- as.numeric(data_cleaned$`Funding Year`)  # note: applied to a factor, this returns the level index, not the calendar year
data_cleaned$EthicsCategoryBinary <-
ifelse(data_cleaned$`Ethics Category` == "Ethics & Equity Related", 1, 0)
model_2 <- glm(EthicsCategoryBinary ~ `Funding Year` + `AI Category`, data = data_cleaned, family = binomial)
summary(model_2)
##
## Call:
## glm(formula = EthicsCategoryBinary ~ `Funding Year` + `AI Category`,
## family = binomial, data = data_cleaned)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.30360 0.21855 -15.116 < 2e-16 ***
## `Funding Year` 0.10114 0.02204 4.588 4.47e-06 ***
## `AI Category`Non AI Related 0.14460 0.13683 1.057 0.291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1770.1 on 3203 degrees of freedom
## Residual deviance: 1747.2 on 3201 degrees of freedom
## AIC: 1753.2
##
## Number of Fisher Scoring iterations: 5
model_3 <- glm(EthicsCategoryBinary ~ `Funding Year` + `AI Category` + `EU Grant` , data = data_cleaned, family = binomial)
summary(model_3)
##
## Call:
## glm(formula = EthicsCategoryBinary ~ `Funding Year` + `AI Category` +
## `EU Grant`, family = binomial, data = data_cleaned)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.254e+00 2.419e-01 -13.454 < 2e-16 ***
## `Funding Year` 1.026e-01 2.230e-02 4.603 4.17e-06 ***
## `AI Category`Non AI Related 1.438e-01 1.369e-01 1.050 0.294
## `EU Grant` -2.076e-07 4.462e-07 -0.465 0.642
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1770.1 on 3203 degrees of freedom
## Residual deviance: 1747.0 on 3200 degrees of freedom
## AIC: 1755
##
## Number of Fisher Scoring iterations: 5