Analyze CAM data on micro, meso, macro level for part II of basal attributes article
1 Global variables
runLPA = FALSE # takes quite long to run
2 Notes
In graph theory and network analysis, the terms “micro,” “meso” and “macro” refer to different scales or levels of analysis within a network. Each level focuses on different elements and interactions, so that the structure and dynamics of the network can be understood at complementary levels of granularity.
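To make the distinction concrete, a minimal sketch on a random toy graph (assuming the igraph API; not the CAM data):

g_toy <- igraph::sample_gnp(n = 30, p = 0.1)
igraph::degree(g_toy)          # micro: node-level indicators
igraph::cluster_louvain(g_toy) # meso: community structure between node and network level
igraph::edge_density(g_toy)    # macro: whole-network indicators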
3 Load data files
# sets the directory of location of this script as the current directory
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))
### load packages
require(pacman)
p_load('tidyverse', 'jsonlite', 'magrittr', 'xlsx',
       'stargazer', 'psych', 'jtools', 'DT', 'ggstatsplot',
       'lavaan',
       'regsem', 'MplusAutomation', 'igraph', 'shiny', 'ggplot2', 'tidyLPA', 'MultilayerExtraction',
       'Matrix', 'foreach', 'doParallel', 'parallel', 'R.matlab',
       'reticulate',
       'RColorBrewer')
# To install MultilayerExtraction (https://github.com/jdwilson4/MultilayerExtraction):
# install.packages("devtools")
# library(devtools, quietly = TRUE)
#
# #install and load MultilayerExtraction
# devtools::install_github('jdwilson4/MultilayerExtraction')
#
library(MultilayerExtraction, quietly = TRUE)
# reticulate::py_config()
# reticulate::py_module_available("pycairo")
# reticulate::py_module_available("cairocffi")
if(!reticulate::py_module_available("igraph")){
  py_install("igraph") # pip install python-igraph
}
if(!reticulate::py_module_available("modularitypruning")){
  py_install("modularitypruning") # pip install modularitypruning
}
if(!reticulate::py_module_available("matplotlib")){
  py_install("matplotlib")
}
if(!reticulate::py_module_available("scipy")){
  py_install("scipy")
}
if(!reticulate::py_module_available("leidenalg")){
  py_install("leidenalg")
}
if(!reticulate::py_module_available("numpy")){
  py_install("numpy")
}
# note: "os" and "random" (for seed) are part of the Python standard library and need no installation

## install Python modules
# py_install("matplotlib")
# py_install("scipy")
# py_install("cairocffi")
# py_install("pycairo") # pip install pycairo
setwd("outputs/01_dataPreperation/final")
### load questionnaire
questionnaire <- readRDS(file = "questionnaire.rds")
CAMfiles <- readRDS(file = "CAMfiles.rds")
CAMdrawn <- readRDS(file = "CAMdrawn.rds")
CAMaggregated <- readRDS(file = "CAMaggregated.rds")
networkIndicators <- readRDS(file = "networkIndicators.rds")

CAMwordlist <- xlsx::read.xlsx2(file = "CAMwordlist.xlsx", sheetIndex = 1)
CAMwordlist$mean_valence <- as.numeric(CAMwordlist$mean_valence)
CAMwordlist$mean_degree <- as.numeric(CAMwordlist$mean_degree)
# CAMwordlist$mean_transitivity <- as.numeric(CAMwordlist$mean_transitivity)
### load functions
# print(getwd())
setwd("../../../functions")
for(i in 1:length(dir())){
  # print(dir()[i])
  source(dir()[i], encoding = "utf-8")
}
setwd("../functions_CAMapp")
for(i in 1:length(dir())){
  # print(dir()[i])
  source(dir()[i], encoding = "utf-8")
}
rm(i)
### summary function
data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      se = sd(x[[col]], na.rm=TRUE) / sqrt(length(x[[col]])))
  }
  data_sum <- ddply(data, groupnames, .fun=summary_func, varname)
  data_sum <- plyr::rename(data_sum, c("mean" = varname))
  return(data_sum)
}
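Illustrative usage (a hypothetical call, mirroring how the histograms below group valence by concept):

# data_summary(valenceData, varname = "value", groupnames = c("text"))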
4 Describe sample
table(questionnaire$socio_sex)
Female Male Prefer not to say
53 114 2
round(x = table(questionnaire$socio_sex) / nrow(questionnaire) * 100, digits = 2)
Female Male Prefer not to say
31.36 67.46 1.18
psych::describe(x = questionnaire[, c("socio_age", "total_min_prolific")])
vars n mean sd median trimmed mad min max range
socio_age 1 169 30.34 9.17 28.00 29.22 8.90 18.00 71.00 53.00
total_min_prolific 2 169 35.39 13.16 33.43 33.93 10.25 12.17 91.88 79.72
skew kurtosis se
socio_age 1.30 2.25 0.71
total_min_prolific 1.41 3.09 1.01
5 Micro level
- Micro-level analysis focuses on the smallest units of analysis in a network — the individual nodes and the edges between them.
5.1 Distributions (valence, degree, local clustering coefficient)
Check the distributions of the following node-level measures (a toy illustration follows the list):
- valence
- degree
- local clustering coefficient (transitivity)
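To make the three measures concrete, a minimal sketch on a toy graph (not the CAM data); valence is a node attribute of the CAMs and is recoded below (a stored value of 10 is mapped to 0):

g_toy <- igraph::graph_from_literal(A - B, A - C, B - C, C - D)
igraph::degree(g_toy)                        # degree: number of incident edges per node
igraph::transitivity(g_toy, type = "local")  # local clustering coefficient (NaN for degree-1 nodes)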
valenceData <- CAMfiles[[1]]
valenceData$value[valenceData$value == 10] <- 0
valenceData$text <-
  str_replace_all(string = valenceData$text,
                  pattern = "  ",
                  replacement = " ") # collapse duplicated whitespace in concept labels

tmp_names <- sort(unique(valenceData$text))
length(tmp_names)
[1] 33
hist1_names <- tmp_names[1:16]
hist2_names <- tmp_names[17:length(tmp_names)]
# adjust aesthetics of ggplot
ggplot_theme <- theme(
  axis.title.x = element_blank(),
  axis.title.y = element_text(size = 12),
  axis.text.x = element_text(
    size = 10,
    hjust = 0.5,
    vjust = 0.5,
    face = "plain",
    colour = "black"
  ),
  axis.text.y = element_text(
    size = 12,
    face = "plain",
    colour = "black"
  ),
  panel.border = element_blank(),
  axis.line = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank()
)
### plot average valence
#> histograms sorted by mean valence
hist1_names_mean <-
  CAMwordlist[order(CAMwordlist$mean_valence, decreasing = TRUE), "Words"][1:16]
hist2_names_mean <-
  CAMwordlist[order(CAMwordlist$mean_valence, decreasing = TRUE), "Words"][17:33]

valenceData %>% filter(text %in% hist1_names_mean) %>%
  ggplot(aes(x = value)) + geom_histogram() +
  facet_wrap( ~ factor(text, levels = hist1_names_mean),
              ncol = 4,
              nrow = 4) +
  ggtitle("Sorted by avg. valence (first)") + ggplot_theme
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
valenceData %>% filter(text %in% hist2_names_mean) %>%
  ggplot(aes(x = value)) + geom_histogram() +
  facet_wrap( ~ factor(text, levels = hist2_names_mean),
              ncol = 4,
              nrow = 5) +
  ggtitle("Sorted by avg. valence (second)") + ggplot_theme
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### create data for mean degree, transitivity
valenceData$degree <- NA
valenceData$transitivity <- NA

for (i in unique(valenceData$participantCAM)) {
  tmp_deg <- igraph::degree(graph = CAMdrawn[[i]], mode = "total")
  tmp_transitivity <-
    igraph::transitivity(graph = as.undirected(CAMdrawn[[i]]),
                         type = "local")
  counter = 1

  for (n in V(CAMdrawn[[i]])$name) {
    valenceData[valenceData$participantCAM %in% i &
                  valenceData$id == n, "degree"] <- tmp_deg[counter]
    valenceData[valenceData$participantCAM %in% i &
                  valenceData$id == n, "transitivity"] <- tmp_transitivity[counter]
    counter = counter + 1
  }
}
### plot average degree
#> histograms sorted by mean degree
hist1_names_degree <-
  CAMwordlist[order(CAMwordlist$mean_degree, decreasing = TRUE), "Words"][1:16]
hist2_names_degree <-
  CAMwordlist[order(CAMwordlist$mean_degree, decreasing = TRUE), "Words"][17:33]

valenceData %>% filter(text %in% hist1_names_degree) %>%
  ggplot(aes(x = degree)) + geom_histogram() +
  facet_wrap( ~ factor(text, levels = hist1_names_degree),
              ncol = 4,
              nrow = 4) +
  ggtitle("Sorted by avg. degree (first)") + ggplot_theme
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
valenceData %>% filter(text %in% hist2_names_degree) %>%
  ggplot(aes(x = degree)) + geom_histogram() +
  facet_wrap( ~ factor(text, levels = hist2_names_degree),
              ncol = 4,
              nrow = 5) +
  ggtitle("Sorted by avg. degree (second)") + ggplot_theme
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### plot average transitivity
#> histograms sorted by mean transitivity
#> ! interpret carefully
CAMwordlist$mean_transitivity <- NA

for(i in 1:nrow(CAMwordlist)){
  CAMwordlist$mean_transitivity[i] <- mean(x = valenceData$transitivity[valenceData$text == CAMwordlist$Words[i]], na.rm = TRUE)
}

hist1_names_transitivity <-
  CAMwordlist[order(CAMwordlist$mean_transitivity, decreasing = TRUE), "Words"][1:16]
hist2_names_transitivity <-
  CAMwordlist[order(CAMwordlist$mean_transitivity, decreasing = TRUE), "Words"][17:33]

valenceData %>% filter(text %in% hist1_names_transitivity) %>%
  ggplot(aes(x = transitivity)) + geom_histogram() +
  facet_wrap( ~ factor(text, levels = hist1_names_transitivity),
              ncol = 4,
              nrow = 4) +
  ggtitle("Sorted by avg. local clustering coefficient (first)") + ggplot_theme
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 1443 rows containing non-finite outside the scale range
(`stat_bin()`).
valenceData %>% filter(text %in% hist2_names_transitivity) %>%
  ggplot(aes(x = transitivity)) + geom_histogram() +
  facet_wrap( ~ factor(text, levels = hist2_names_transitivity),
              ncol = 4,
              nrow = 5) +
  ggtitle("Sorted by avg. local clustering coefficient (second)") + ggplot_theme
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 970 rows containing non-finite outside the scale range
(`stat_bin()`).
5.2 Check bimodal pattern of the local clustering coefficient
tmp_transitivity_0 <- valenceData$participantCAM[valenceData$text == "Insekten ähnlich" & valenceData$transitivity == 0 & !is.na(valenceData$transitivity)]
tmp_transitivity_1 <- valenceData$participantCAM[valenceData$text == "Insekten ähnlich" & valenceData$transitivity == 1 & !is.na(valenceData$transitivity)]

plot(CAMdrawn[[tmp_transitivity_0[1]]], edge.arrow.size = .7,
     layout=layout_nicely, vertex.frame.color="black", asp = .5, margin = -0.1,
     vertex.size = 10, vertex.label.cex = .9)
plot(CAMdrawn[[tmp_transitivity_1[1]]], edge.arrow.size = .7,
     layout=layout_nicely, vertex.frame.color="black", asp = .5, margin = -0.1,
     vertex.size = 10, vertex.label.cex = .9)
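The bimodality largely follows from low-degree nodes: a node with exactly two neighbours has only one possible neighbour pair, so its local clustering coefficient is exactly 1 if that pair is connected and 0 otherwise. A minimal sketch (toy triads, not the CAM data):

g_open <- igraph::graph_from_literal(B - A, A - C)          # open triad: B - A - C
g_closed <- igraph::graph_from_literal(B - A, A - C, B - C) # closed triad (triangle)
igraph::transitivity(g_open, type = "local", vids = "A")    # 0
igraph::transitivity(g_closed, type = "local", vids = "A")  # 1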
5.3 Correlation of distributions on the micro level
cor.plot(r = cor(valenceData[, c("value", "degree", "transitivity")], use = "pairwise"))
6 Meso level
- Meso-level analysis bridges the micro and macro by focusing on groups and communities within the network.
6.1 Leiden algorithm
modules <- reticulate::py_module_available("leidenalg") && reticulate::py_module_available("igraph")
modules
[1] TRUE
adjacency_matrix <- CAMaggregated[[1]]

diag(adjacency_matrix) <- 0

graph_object <-
  graph_from_adjacency_matrix(adjacency_matrix, mode = "undirected")

partition <- leiden::leiden(object = graph_object, resolution_parameter = 1.37)

for (i in sort(unique(partition))) {
  cat(
    "\nfor partition",
    i, "the following words have been found:\n",
    rownames(adjacency_matrix)[partition == i],
    "\n"
  )
}
for partition 1 the following words have been found:
aktive Formänderung durch Umwelteinwirkung reaktionsfähig passive Formänderung durch Umwelteinwirkung autonom passive Verhaltensänderung durch Umwelteinwirkung aktive Verhaltensänderung durch Umwelteinwirkung intelligent multifunktional technologisch
for partition 2 the following words have been found:
zuverlässig wartungsfrei selbstheilend widerstandsfähig selbstreparierend haltbar robust langlebig
for partition 3 the following words have been found:
Akzeptanz eines neuen Materialsystems wartungsintensiv enthält Kunststoff leicht zerstörbar umweltschädlich
for partition 4 the following words have been found:
Energie speichernd energieeffizient energieautonom Energie generierend
for partition 5 the following words have been found:
ökologisch elektronikfrei nachhaltig umweltfreundlich
for partition 6 the following words have been found:
Insekten ähnlich bioinspiriert lebensähnlich
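The resolution parameter of 1.37 fixes the granularity of this solution; as a quick robustness sketch (not part of the reported analysis), the number of communities can be inspected across a range of resolutions:

for (res in c(0.5, 1, 1.37, 2)) {
  tmp_partition <- leiden::leiden(object = graph_object, resolution_parameter = res)
  cat("resolution:", res, "-> number of communities:", length(unique(tmp_partition)), "\n")
}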
node.cols <- brewer.pal(max(c(3, partition)), "Pastel1")[partition]

### colour aggregated CAM
g = CAMaggregated[[2]]
g2 = simplify(CAMaggregated[[2]])
# plot(g2, edge.arrow.size=0.01,
#      vertex.size=diag(CAMaggregated[[1]]) / max(diag(CAMaggregated[[1]]))*20)
# weight each simplified edge by the number of parallel edges in the multigraph
# (adjacent nodes: every parallel edge is one shortest path of length 1)
E(g2)$weight = sapply(E(g2), function(e) {
  length(all_shortest_paths(g, from=ends(g2, e)[1], to=ends(g2, e)[2])$res) })
E(g2)$weight = E(g2)$weight * 2

for(i in 1:5){
  plot(g2, edge.arrow.size = 0,
       layout=layout_nicely, vertex.frame.color="black", asp = .4, margin = .1,
       vertex.size=diag(CAMaggregated[[1]]) / max(diag(CAMaggregated[[1]]))*5,
       vertex.label.cex = .9,
       edge.weight=2, edge.width=(E(g2)$weight/35), vertex.color = node.cols)
}
6.1.1 Save final partition solution
setwd("outputs/02_analyses_CAMs/Leiden algorithm aggregated CAM")
= 1
h for (i in sort(unique(partition))) {
if(h == 1){
<- data.frame(parition = i, words = rownames(adjacency_matrix)[partition == i])
dat_out else{
}<- rbind(dat_out,
dat_out data.frame(parition = i, words = rownames(adjacency_matrix)[partition == i]))
}
= h + 1
h
}
dat_out$mean_valence <- NA
dat_out$sd_valence <- NA
for(i in 1:nrow(dat_out)){
  dat_out$mean_valence[i] <- CAMwordlist$mean_valence[CAMwordlist$Words == dat_out$words[i]]
  dat_out$sd_valence[i] <- CAMwordlist$sd_valence[CAMwordlist$Words == dat_out$words[i]]
}

dat_out$words <- str_replace_all(string = dat_out$words, pattern = " {2,6}", replacement = " ")

dat_out$sd_valence <- as.numeric(dat_out$sd_valence)
dat_out$mean_valence <- round(x = dat_out$mean_valence, digits = 2)

## save as .xlsx file
xlsx::write.xlsx2(x = dat_out, file = "LeidenAlgorithm_solution.xlsx", row.names = FALSE)
## save as R object
saveRDS(dat_out, file = "LeidenAlgorithm_solution.rds")
## add means for partitions
partitions_means <- dat_out %>%
  group_by(partition) %>%
  summarise(N = n(), mean = mean(mean_valence), SD = mean(sd_valence))

dat_out$sd_valence <- NULL
dat_out$group_mean <- NA

partitions_means$mean <- round(x = partitions_means$mean, digits = 2)
partitions_means$SD <- round(x = partitions_means$SD, digits = 2)

# write the group summary "mean (SD)" into the last row of each partition
for(i in 1:nrow(partitions_means)){
  tmp_boolean <- dat_out$partition == i
  tmp_boolean <- which(tmp_boolean)[length(which(tmp_boolean))]
  dat_out$group_mean[tmp_boolean] <- paste0(partitions_means$mean[i], " (", partitions_means$SD[i], ")")
}
stargazer(dat_out, type = "html", summary = FALSE, rownames = FALSE, out = "tableAppendixD.html")
(HTML table written to tableAppendixD.html; its contents are identical to the LaTeX table below.)
stargazer(dat_out, type = "latex", summary = FALSE, rownames = FALSE)
% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Mi, Mai 22, 2024 - 11:25:18
\begin{table}[!htbp] \centering
\caption{}
\label{}
\begin{tabular}{@{\extracolsep{5pt}} cccc}
\\[-1.8ex]\hline
\hline \\[-1.8ex]
partition & words & mean\_valence & group\_mean \\
\hline \\[-1.8ex]
$1$ & aktive Formänderung durch Umwelteinwirkung & $0.180$ & \\
$1$ & reaktionsfähig & $0.670$ & \\
$1$ & passive Formänderung durch Umwelteinwirkung & $$-$0.020$ & \\
$1$ & autonom & $0.750$ & \\
$1$ & passive Verhaltensänderung durch Umwelteinwirkung & $0.060$ & \\
$1$ & aktive Verhaltensänderung durch Umwelteinwirkung & $0.270$ & \\
$1$ & intelligent & $1.350$ & \\
$1$ & multifunktional & $1.650$ & \\
$1$ & technologisch & $0.760$ & 0.63 (1.07) \\
$2$ & zuverlässig & $2.170$ & \\
$2$ & wartungsfrei & $1.750$ & \\
$2$ & selbstheilend & $1.650$ & \\
$2$ & widerstandsfähig & $1.740$ & \\
$2$ & selbstreparierend & $1.790$ & \\
$2$ & haltbar & $1.930$ & \\
$2$ & robust & $1.820$ & \\
$2$ & langlebig & $2.220$ & 1.88 (1.07) \\
$3$ & Akzeptanz eines neuen Materialsystems & $0.360$ & \\
$3$ & wartungsintensiv & $$-$2.040$ & \\
$3$ & enthält Kunststoff & $$-$1.040$ & \\
$3$ & leicht zerstörbar & $$-$1.800$ & \\
$3$ & umweltschädlich & $$-$2.600$ & -1.42 (1.06) \\
$4$ & Energie speichernd & $1.500$ & \\
$4$ & energieeffizient & $2.080$ & \\
$4$ & energieautonom & $1.430$ & \\
$4$ & Energie generierend & $1.720$ & 1.68 (1.06) \\
$5$ & ökologisch & $1.900$ & \\
$5$ & elektronikfrei & $0.460$ & \\
$5$ & nachhaltig & $2.380$ & \\
$5$ & umweltfreundlich & $2.490$ & 1.81 (1.01) \\
$6$ & Insekten ähnlich & $$-$0.120$ & \\
$6$ & bioinspiriert & $0.780$ & \\
$6$ & lebensähnlich & $0.330$ & 0.33 (1.03) \\
\hline \\[-1.8ex]
\end{tabular}
\end{table}
Compute the mean valence for all possible pairs of the identified partitions:
setwd("outputs/02_analyses_CAMs/Leiden algorithm aggregated CAM")
# Generate all combinations of 2 elements
<- combn(partitions_means$parition, 2, simplify = FALSE)
combinations
for(i in 1:length(combinations)){
<- data.frame(partitionA = combinations[[i]][1],
tmp partitionB = combinations[[i]][2],
mean = mean(partitions_means$mean[combinations[[i]]]),
wordsA = paste0(dat_out$words[dat_out$parition == combinations[[i]][1]], collapse = " \\ "),
wordsB = paste0(dat_out$words[dat_out$parition == combinations[[i]][2]], collapse = " \\ "))
if(i == 1){
<- tmp
dat_out_hypotheses else{
}<- rbind(dat_out_hypotheses, tmp)
dat_out_hypotheses
}
}
<- dat_out_hypotheses[order(dat_out_hypotheses$mean),]
dat_out_hypotheses
## save as .xlsx file
::write.xlsx2(x = dat_out_hypotheses, file = "LeidenAlgorithm_hypotheses.xlsx", row.names = FALSE)
xlsx## save as R object
saveRDS(dat_out_hypotheses, file = "LeidenAlgorithm_hypotheses.rds")
6.1.2 Leiden algorithm on aggregated multi-edge graph with ModularityPruning (Gibson and Mucha, 2022)
Community detection based on the single-layer multi-edge aggregated graph. Partitions are then pruned with ModularityPruning (http://github.com/ragibson/ModularityPruning) to keep only stable and modularity-optimal solutions.
import numpy as np
import igraph as ig
from modularitypruning import prune_to_stable_partitions
from modularitypruning.leiden_utilities import repeated_parallel_leiden_from_gammas
from modularitypruning.champ_utilities import CHAMP_2D
from modularitypruning.parameter_estimation_utilities import ranges_to_gamma_estimates
from modularitypruning.plotting import plot_estimates
import matplotlib.pyplot as plt
from random import seed
from python_helpers import print_partition # our local helper scripts

np.set_printoptions(threshold=np.inf)

# read the vertex names from the header line of the adjacency matrix file
with open("outputs/01_dataPreperation/final/CAMaggregated_adj_matrix.csv", "r") as f:
    names = f.readline()

names = names.split('" "')
names[0] = names[0][1:]    # strip the first "
names[-1] = names[-1][:-2] # strip the last " and newline
names = dict(enumerate(names))

adj_matrix = np.loadtxt(open("outputs/01_dataPreperation/final/CAMaggregated_adj_matrix.csv", "rb"), delimiter=" ", skiprows=1)

# remove self-loops
np.fill_diagonal(adj_matrix, 0)

G = ig.Graph.Adjacency(matrix=adj_matrix, mode="undirected")

gamma_range = (0, 2)
leiden_gammas = np.linspace(*gamma_range, 10 ** 5) # 100k runs of Leiden algorithm

seed(12345)
partitions = repeated_parallel_leiden_from_gammas(G, leiden_gammas)

# prune to the stable partitions from gamma=0 to gamma=2
seed(12345)
stable_parts = prune_to_stable_partitions(G, partitions, *gamma_range)

# run CHAMP to obtain the dominant partitions along with their regions of optimality
ranges = CHAMP_2D(G, partitions, gamma_0=0.0, gamma_f=2.0)

# append gamma estimate for each dominant partition onto the CHAMP domains
gamma_estimates = ranges_to_gamma_estimates(G, ranges)

# get some info about our partitions
for stable_part in stable_parts:
    print_partition(stable_part, partitions, names, gamma_estimates)
    print("\n\n")

# plot gamma estimates and domains of optimality
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

plot_estimates(gamma_estimates)
plt.title(r"CHAMP Domains of Optimality and $\gamma$ Estimates", fontsize=14)
plt.xlabel(r"$\gamma$", fontsize=14)
plt.ylabel("Number of communities", fontsize=14)
plt.show()
6.1.3 Leiden algorithm on multilayer graph with ModularityPruning (Gibson and Mucha, 2022)
Community detection based on a multilayer graph with participants as layers. Partitions are then pruned with ModularityPruning (http://github.com/ragibson/ModularityPruning) to keep only stable and modularity-optimal solutions.
import numpy as np
import igraph as ig
from modularitypruning import prune_to_multilayer_stable_partitions
from modularitypruning.leiden_utilities import repeated_parallel_leiden_from_gammas_omegas
from modularitypruning.champ_utilities import CHAMP_3D
from modularitypruning.parameter_estimation_utilities import domains_to_gamma_omega_estimates
from modularitypruning.plotting import plot_2d_domains_with_estimates
import matplotlib.pyplot as plt
from random import seed
from python_helpers import print_partition # our local helper scripts
import cairocffi
import scipy.io

np.set_printoptions(threshold=np.inf)

# CAMaggregated = scipy.io.loadmat("outputs/01_dataPreperation/final/CAMaggregated_adj_matrices.mat") # !!!
CAMaggregated = scipy.io.loadmat("outputs/01_dataPreperation/final/CAMaggregated_adj_matrices_onlyOnes.mat") # !!!

# import names for labelling
with open("outputs/01_dataPreperation/final/CAMaggregated_adj_matrix.csv", "r") as f:
    names = f.readline()

names = names.split('" "')
names[0] = names[0][1:]    # strip the first "
names[-1] = names[-1][:-2] # strip the last " and newline
names = dict(enumerate(names))

adj_matrices = list(CAMaggregated["multigraph_adj_matrices_list"][0,0])

num_layers = len(adj_matrices)
n_per_layer = 33

# nodes 0..32 are layer 0
# nodes 33..65 are layer 1
# ...
# layer_vec holds the layer membership of each node
# e.g. layer_vec[5] = 2 means that node 5 resides in layer 2 (the third layer)
layer_vec = [i // n_per_layer for i in range(n_per_layer * num_layers)]
interlayer_edges = [(n_per_layer * layer + v, n_per_layer * layer + v + n_per_layer)
                    for layer in range(num_layers - 1)
                    for v in range(n_per_layer)]

# intralayer edges: we need a list of tuples (i.e. an edge list)
# recode the node indices according to the scheme described above (33..65 is layer 1 etc.)
# note that this is unweighted for now (could add weights to the igraph object)
intralayer_edges = []
for i, adj_matrix in enumerate(adj_matrices):
    conn_indices = np.where(adj_matrix)
    x_indices, y_indices = conn_indices
    x_indices += i * n_per_layer
    y_indices += i * n_per_layer
    edges = zip(x_indices, y_indices)
    intralayer_edges += edges

G_interlayer = ig.Graph(interlayer_edges)
G_intralayer = ig.Graph(intralayer_edges)
# remove the double edges we got from constructing the igraph objects with symmetric matrices
G_intralayer = G_intralayer.simplify()

# run Leiden on a uniform 32x32 grid (1024 samples) of gamma and omega in [0, 2]
gamma_range = (0, 2)
omega_range = (0, 2)
leiden_gammas = np.linspace(*gamma_range, 32)
leiden_omegas = np.linspace(*omega_range, 32)

seed(12345)
parts = repeated_parallel_leiden_from_gammas_omegas(G_intralayer, G_interlayer, layer_vec, gammas=leiden_gammas, omegas=leiden_omegas)

# prune to the stable partitions from (gamma=0, omega=0) to (gamma=2, omega=2)
seed(12345)
stable_parts = prune_to_multilayer_stable_partitions(G_intralayer, G_interlayer, layer_vec,
                                                     "multiplex", parts,
                                                     *gamma_range, *omega_range)

# run CHAMP to obtain the dominant partitions along with their regions of optimality
seed(12345)
domains = CHAMP_3D(G_intralayer, G_interlayer, layer_vec, parts,
                   gamma_0=gamma_range[0], gamma_f=gamma_range[1],
                   omega_0=omega_range[0], omega_f=omega_range[1])

# append resolution parameter estimates for each dominant partition onto the CHAMP domains
seed(12345)
domains_with_estimates = domains_to_gamma_omega_estimates(G_intralayer, G_interlayer, layer_vec,
                                                          domains, model='multiplex')

for partition in stable_parts:
    print_partition(partition, parts, names, domains_with_estimates, type="multilayer")

# plot resolution parameter estimates and domains of optimality
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plot_2d_domains_with_estimates(domains_with_estimates, xlim=omega_range, ylim=gamma_range)
plt.title(r"CHAMP Domains and ($\omega$, $\gamma$) Estimates", fontsize=16)
plt.xlabel(r"$\omega$", fontsize=20)
plt.ylabel(r"$\gamma$", fontsize=20)
plt.gca().tick_params(axis='both', labelsize=12)
plt.tight_layout()
plt.show()
6.1.4 MultilayerExtraction algorithm (Wilson et al., 2017)
Community detection based on a multilayer graph with participants as layers, using the MultilayerExtraction package by Wilson et al. (2017) (https://github.com/jdwilson4/MultilayerExtraction).
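The package expects the multilayer network as an edge list with integer-coded columns node1, node2 and layer, which is the format the recoding below produces. A toy illustration (hypothetical data; the column names are assumed from the structure of CAMaggregated[[4]]):

toy_adjacency <- data.frame(node1 = c(1, 1, 2, 1),  # integer vertex IDs
                            node2 = c(2, 3, 3, 2),
                            layer = c(1, 1, 1, 2))  # integer layer (participant) IDs
# toy_result <- multilayer.extraction(adjacency = toy_adjacency, seed = 1,
#                                     min.score = 0, prop.sample = 1)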
setwd("outputs/02_analyses_CAMs/MultilayerExtraction")
# recode vertex labels to integers
<- c(hist1_names, hist2_names)
attributes_names
# there were differences in whitespace between some of the concepts (2 vs. 3 whitespaces) -> fix it
4]][["node1"]][CAMaggregated[[4]][["node1"]] == "Energie generierend"] <- "Energie generierend"
CAMaggregated[[4]][["node2"]][CAMaggregated[[4]][["node2"]] == "Energie generierend"] <- "Energie generierend"
CAMaggregated[[4]][["node1"]][CAMaggregated[[4]][["node1"]] == "Energie speichernd"] <- "Energie speichernd"
CAMaggregated[[4]][["node2"]][CAMaggregated[[4]][["node2"]] == "Energie speichernd"] <- "Energie speichernd"
CAMaggregated[[
4]][["node1"]] <- match(CAMaggregated[[4]][["node1"]], attributes_names)
CAMaggregated[[4]][["node2"]] <- match(CAMaggregated[[4]][["node2"]], attributes_names)
CAMaggregated[[
# recode layers (CAM IDs) to integers
<- unique(CAMfiles[[1]]$participantCAM)
layer_names 4]][["layer"]] <- match(CAMaggregated[[4]][["layer"]], layer_names)
CAMaggregated[[
# now we're ready for community extraction using the MultilayerExtraction package
set.seed(12345)
# uncomment to run the algorithm (took around 3 hours on our machine)
# start_time <- Sys.time()
# multilayer_extraction_results <- multilayer.extraction(adjacency = CAMaggregated[[4]], seed = 12345, min.score = 0, prop.sample = .10)
# end_time <- Sys.time()
# end_time - start_time
# saveRDS(multilayer_extraction_results, file = "multilayer_extraction_results.Rda")
multilayer_extraction_results <- readRDS(file = "multilayer_extraction_results.Rda")

plot(multilayer_extraction_results, main = "Diagnostic Plot")

# look at the first 20 communities
multilayer_extraction_communities <- data.frame(n_communities = seq(1:20))
multilayer_extraction_communities[paste0("community", 1:20)] <- NA

for(i in 1:20) {
  object <- refine(multilayer_extraction_results, k = i, m = 192, n = nrow(CAMwordlist))

  for(j in 1:i) {
    multilayer_extraction_communities[i, j+1] <- paste(attributes_names[which(object$Vertices[,j] == 1)], collapse = ", ")
  }
}

DT::datatable(multilayer_extraction_communities, options = list(pageLength = 5))
7 Macro level
- Macro-level analysis looks at the network as a whole, focusing on large-scale patterns and structures.
7.1 Plot aggregated CAM
g = CAMaggregated[[2]]
g2 = simplify(CAMaggregated[[2]])
# plot(g2, edge.arrow.size=0.01,
#      vertex.size=diag(CAMaggregated[[1]]) / max(diag(CAMaggregated[[1]]))*20)
E(g2)$weight = sapply(E(g2), function(e) {
  length(all_shortest_paths(g, from=ends(g2, e)[1], to=ends(g2, e)[2])$res) })
E(g2)$weight = E(g2)$weight * 2
# E(g2)$weight[E(g2)$weight == 1] <- NA
V(g2)$color[V(g2)$value <= .5 & V(g2)$value >= -.5] <- "yellow"
V(g2)$shape <- NA
V(g2)$shape <- ifelse(test = V(g2)$color == "yellow", yes = "square", no = "circle")
### > plot multiple times because of random layout
for(i in 1:3){
  plot(g2, edge.arrow.size = 0,
       layout=layout_nicely, vertex.frame.color="black", asp = .5, margin = -0.1,
       vertex.size=diag(CAMaggregated[[1]]) / max(diag(CAMaggregated[[1]]))*5,
       vertex.label.cex = .9,
       edge.weight=2, edge.width=(E(g2)$weight/45))
}
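Beyond the plot, the aggregated CAM can be summarised with whole-network indicators; a minimal sketch (assuming the igraph API):

# edge_density(g2)                  # share of realised edges
# transitivity(g2, type = "global") # global clustering coefficient
# diameter(g2, weights = NA)        # longest shortest path (ignoring edge weights)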
7.2 Hierarchical cluster analysis
hc_dat <- data.frame(CAM = unique(CAMfiles[[1]]$CAM), participantCAM = unique(CAMfiles[[1]]$participantCAM))

### create data set
## word vars
for(w in unique(CAMfiles[[1]]$text)){
  varName_w <- str_remove_all(string = str_to_title(string = w, locale = "en"), pattern = " |\\W+")

  hc_dat[[paste0("N_", varName_w)]] <- NA
  hc_dat[[paste0("mean_", varName_w)]] <- NA
  hc_dat[[paste0("SD_", varName_w)]] <- NA
}
## get N, mean, sd of single summarized concepts
verbose = FALSE

for(c in unique(CAMfiles[[1]]$CAM)){
  if(verbose){
    cat("considered CAM: ", c, "\n")
  }
  tmp_CAM_nodes <- CAMfiles[[1]][CAMfiles[[1]]$CAM == c, ]
  tmp_CAM_nodes$value <- ifelse(test = tmp_CAM_nodes$value == 10, yes = 0, no = tmp_CAM_nodes$value)

  for(w in unique(CAMfiles[[1]]$text)){
    if(verbose){
      cat("considered concept: ", w, "\n")
    }

    varName_w <- str_remove_all(string = str_to_title(string = w, locale = "en"), pattern = " |\\W+")
    if(verbose){
      cat(" > the frequency, mean, SD are saved with the prefix N_, mean_, SD_ plus the
          word without white spaces: ", varName_w, "\n")
    }

    if(sum(tmp_CAM_nodes$text == w) > 0){
      tmp_CAM_nodes_w <- tmp_CAM_nodes[tmp_CAM_nodes$text == w, ]

      ## add N
      hc_dat[hc_dat$CAM == c, paste0("N_", varName_w)] <- nrow(tmp_CAM_nodes_w)
      ## add mean
      hc_dat[hc_dat$CAM == c, paste0("mean_", varName_w)] <- mean(x = tmp_CAM_nodes_w$value)
      ## add SD (only defined if the concept occurs more than once)
      hc_dat[hc_dat$CAM == c, paste0("SD_", varName_w)] <- sd(x = tmp_CAM_nodes_w$value)
    }
  }
  if(verbose){
    cat("\n")
  }
}
### compute clustering
hc_df <- hc_dat[, str_subset(string = colnames(hc_dat), pattern = "mean_")]
hc_df <- hc_df[, colSums(x = !is.na(hc_df)) >= 2] # only consider concepts which were drawn at least 2 times

hc_df_scaled <- scale(hc_df)

dist.eucl <- dist(hc_df_scaled, method = "euclidean")

hc_cluster <- hclust(dist.eucl, method = "ward.D2") # Ward's method
hc_cluster
Call:
hclust(d = dist.eucl, method = "ward.D2")
Cluster method : ward.D2
Distance : euclidean
Number of objects: 169
set_cutOff <- 25

plot(hc_cluster)
abline(h = set_cutOff, col = "red", lty = 2)
rect.hclust(hc_cluster, h=set_cutOff, border = "tomato")

groups <- cutree(hc_cluster, h=set_cutOff)
# groupsOut <- names(table(groups))[table(groups) >= 2]
table(groups)
groups
1 2 3
40 106 23
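Equivalently, the tree can be cut by the number of clusters instead of a height; a sketch (because hclust merges are nested, k = 3 reproduces the h = 25 solution above):

# groups_k <- cutree(hc_cluster, k = 3)
# table(groups_k, groups)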
aggregate(hc_df, by=list(cluster=groups), mean, na.rm = TRUE)
cluster mean_AkzeptanzEinesNeuenMaterialsystems mean_EnergieSpeichernd
1 1 0.2500000 2.0000000
2 2 0.4339623 1.5094340
3 3 0.1739130 0.5652174
mean_Wartungsintensiv mean_AktiveFormänderungDurchUmwelteinwirkung
1 -1.9750000 0.9250000
2 -2.3018868 -0.1132075
3 -0.9565217 0.2173913
mean_Energieeffizient mean_EnthältKunststoff mean_Ökologisch
1 2.575000 -1.1500000 2.300000
2 2.094340 -1.1226415 1.981132
3 1.130435 -0.4782609 0.826087
mean_Reaktionsfähig mean_Elektronikfrei mean_Zuverlässig mean_Nachhaltig
1 1.3500000 0.9000000 2.650000 2.775000
2 0.4433962 0.3584906 2.245283 2.424528
3 0.5217391 0.1304348 1.000000 1.521739
mean_PassiveFormänderungDurchUmwelteinwirkung mean_Wartungsfrei
1 0.62500000 2.350000
2 -0.28301887 1.679245
3 0.08695652 1.043478
mean_Umweltfreundlich mean_Selbstheilend mean_Widerstandsfähig
1 2.825000 2.4750000 2.3500000
2 2.613208 1.5471698 1.7358491
3 1.347826 0.6956522 0.6956522
mean_Energieautonom mean_InsektenÄhnlich mean_Selbstreparierend mean_Autonom
1 1.9500000 0.45000000 2.5000000 1.2500000
2 1.3679245 -0.33962264 1.7735849 0.6509434
3 0.7826087 -0.08695652 0.6086957 0.3478261
mean_Bioinspiriert mean_PassiveVerhaltensänderungDurchUmwelteinwirkung
1 1.6000000 0.6500000
2 0.5377358 -0.2264151
3 0.4782609 0.3478261
mean_LeichtZerstörbar mean_AktiveVerhaltensänderungDurchUmwelteinwirkung
1 -2.150000 0.95000000
2 -1.820755 0.01886792
3 -1.086957 0.26086957
mean_Intelligent mean_Lebensähnlich mean_Haltbar mean_EnergieGenerierend
1 1.8750000 0.82500000 2.5500000 2.275000
2 1.2452830 0.18867925 1.9811321 1.622642
3 0.9130435 0.08695652 0.6086957 1.173913
mean_Umweltschädlich mean_Robust mean_Langlebig mean_Multifunktional
1 -2.900000 2.2500000 2.800000 2.200000
2 -2.754717 1.8584906 2.198113 1.547170
3 -1.391304 0.9130435 1.304348 1.173913
mean_Technologisch
1 0.9500000
2 0.7358491
3 0.5652174
7.3 Latent profile analysis
if(runLPA){
  valenceDataShort <-
    valenceData %>% pivot_wider(names_from = text,
                                values_from = value,
                                id_cols = CAM) # equal to hc_df
  lpa <- valenceDataShort %>% select(-CAM) %>%
    estimate_profiles(1:9, variances = c("equal", "equal", "varying"),
                      covariances = c("zero", "equal", "zero"))
}
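Once runLPA is set to TRUE, the candidate profile solutions can be compared via their fit indices; a sketch (assuming the tidyLPA API):

# get_fit(lpa) # AIC, BIC, entropy, ... per profile solution
# compare_solutions(lpa, statistics = c("AIC", "BIC"))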