Data preperation part II of basal attributes article

Author

Julius Fenn, Paul Sölder

1 Notes

2 global variables

Define your global variables (can take some time to run):

save_CAMs_as_pictures = TRUE # TRUE = save CAMs as .json, .png files
plot_deleted_CAMs = TRUE

3 create raw data files

# sets the directory of location of this script as the current directory
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))

### load packages
# devtools::install_github('jdwilson4/multilayer_extraction')
require(pacman)
p_load('tidyverse', 'jsonlite', 'magrittr', 'xlsx',
       'stargazer', 'psych', 'jtools', 'DT', 'ggstatsplot',
       'lavaan',
       'regsem', 'MplusAutomation', 'igraph', 'ggplot2', 'tidyLPA', 'MultilayerExtraction',
       'Matrix', 'igraph', 'foreach', 'doParallel', 'parallel',
       'R.matlab')


### load socio-demographic data
setwd("data demographic")
prolific <- read.csv(file = "prolific_export_65d4744b1373145896174eba.csv", header = TRUE)

setwd("../data final list basal attributes")
listBasalAttributes <- read.xlsx2(file = "final list basal attributes.xlsx", sheetIndex = 1)
listBasalAttributes$German.Translation[listBasalAttributes$German.Translation == "bio-inspiriert"] <- "bioinspiriert"
listBasalAttributes$German.Translation[listBasalAttributes$German.Translation == "Energie generierend"] <- "Energie   generierend"
listBasalAttributes$German.Translation[listBasalAttributes$German.Translation == "Energie speichernd"] <- "Energie   speichernd"

### list data files
setwd("../data")
folders <- list.files(pattern = "^study_result.*")

### create data files
### get CAM data
writeLines("", "CAMdata.txt") # create file
text_connection <- file("CAMdata.txt", "a") # open connection to append

### get pre CAM data
writeLines("", "preCAM.txt") # create file
text_connection_pre <- file("preCAM.txt", "a") # open connection to append

### get post CAM data
writeLines("", "postCAM.txt") # create file
text_connection_post <- file("postCAM.txt", "a") # open connection to append

for(i in 1:length(folders)){
  setwd(folders[i])
  if(length(dir()) == 3){
    # print(i)
    ### CAM data
    setwd(dir()[2])
    tmp <- jsonlite::fromJSON(txt = "data.txt")
    writeLines(jsonlite::toJSON(x = tmp), text_connection)
    setwd("..")

    ### pre CAM data
    setwd(dir()[1])
    tmp <- jsonlite::fromJSON(txt = "data.txt")
    writeLines(jsonlite::toJSON(x = tmp), text_connection_pre)
    setwd("..")

    ### post CAM data
    setwd(dir()[3])
    tmp <- jsonlite::fromJSON(txt = "data.txt")
    writeLines(jsonlite::toJSON(x = tmp), text_connection_post)
    setwd("..")
  }
  setwd("..")
}

close(text_connection) # close connection CAM
close(text_connection_pre) # close connection
close(text_connection_post) # close connection

### move files to output folder
# copy files (not overwritten)
tmp_file_from <-  getwd()
setwd("../outputs/01_dataPreperation")
file.copy(from =  paste0(tmp_file_from, "/CAMdata.txt"), to = paste0(getwd(), "/CAMdata.txt"))
[1] FALSE
file.copy(from =  paste0(tmp_file_from, "/preCAM.txt"), to = paste0(getwd(), "/preCAM.txt"))
[1] FALSE
file.copy(from =  paste0(tmp_file_from, "/postCAM.txt"), to = paste0(getwd(), "/postCAM.txt"))
[1] FALSE
### remove files
file.remove(paste0(tmp_file_from, "/CAMdata.txt"))
[1] TRUE
file.remove(paste0(tmp_file_from, "/preCAM.txt"))
[1] TRUE
file.remove(paste0(tmp_file_from, "/postCAM.txt"))
[1] TRUE
### load functions
setwd("../../functions")
for(i in 1:length(dir())){
  # print(dir()[i])
  source(dir()[i], encoding = "utf-8")
}


setwd("../functions_CAMapp")
for(i in 1:length(dir())){
  # print(dir()[i])
  source(dir()[i], encoding = "utf-8")
}
rm(i)



### summary function
data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      se = sd(x[[col]], na.rm=TRUE) / sqrt(length(x[[col]])))
  }
  data_sum<-ddply(data, groupnames, .fun=summary_func,
                  varname)
  data_sum <- plyr::rename(data_sum, c("mean" = varname))
  return(data_sum)
}

4 set up data.frame questionnaires

setwd("outputs/01_dataPreperation")
# > pre study
suppressMessages(
  read_file('preCAM.txt') %>%
    # ... split it into lines ...
    str_split('\n') %>% first() %>%
    # ... filter empty rows ...
    discard(function(x)
      x == '') %>%
    discard(function(x)
      x == '\r') %>%
    # ... parse JSON into a data.frame
    map_dfr(fromJSON, flatten = TRUE)
) -> dat_preCAM
# > post CAM
read_file('postCAM.txt') %>%
  # ... split it into lines ...
  str_split('\n') %>% first() %>%
  # ... filter empty rows ...
  discard(function(x)
    x == '') %>%
  discard(function(x)
    x == '\r') -> tmp_postCAM


tmp_out <- list()
for (i in 1:length(tmp_postCAM)) {
  tmp_out[[i]] <- fromJSON(tmp_postCAM[i])
  tmp_out[[i]]$"21" <- NULL # causes error
}

dat_postCAM <- dplyr::bind_rows(tmp_out)
rm(tmp_postCAM)
rm(tmp_out)



### create counter variable for both data sets
# > pre study
dat_preCAM$ID <- NA

tmp_IDcounter <- 0
for (i in 1:nrow(dat_preCAM)) {
  if (!is.na(dat_preCAM$sender[i]) &&
      dat_preCAM$sender[i] == "Greetings") {
    # tmp <- dat_preCAM$prolific_pid[i]
    tmp_IDcounter = tmp_IDcounter + 1
  }
  dat_preCAM$ID[i] <- tmp_IDcounter
}

# > post CAM
dat_postCAM$ID <- NA

tmp_IDcounter <- 0
for (i in 1:nrow(dat_postCAM)) {
  if (!is.na(dat_postCAM$sender[i]) &&
      dat_postCAM$sender[i] == "CAMfeedbackGeneral") {
    # tmp <- dat_postCAM$prolific_pid[i]
    tmp_IDcounter = tmp_IDcounter + 1
  }
  dat_postCAM$ID[i] <- tmp_IDcounter
}



### keep only complete data sets
# > pre study
# sort(table(dat_preCAM$ID))
sum(table(dat_preCAM$ID) != max(table(dat_preCAM$ID)))
[1] 0
sum(table(dat_preCAM$ID) == max(table(dat_preCAM$ID)))
[1] 193
dat_preCAM <-
  dat_preCAM[dat_preCAM$ID %in% names(table(dat_preCAM$ID))[table(dat_preCAM$ID) == max(table(dat_preCAM$ID))], ]

# > post CAM
# sort(table(dat_postCAM$ID))
sum(table(dat_postCAM$ID) != max(table(dat_postCAM$ID)))
[1] 1
sum(table(dat_postCAM$ID) == max(table(dat_postCAM$ID)))
[1] 192
dat_postCAM <-
  dat_postCAM[dat_postCAM$ID %in% names(table(dat_postCAM$ID))[table(dat_postCAM$ID) == max(table(dat_postCAM$ID))], ]



### json (from JATOS) to 2D data.frame
# > pre study
# add paradata
tmp_notNumeric <-
  str_subset(string = colnames(dat_preCAM), pattern = "^meta|^R")
tmp_notNumeric <-
  str_subset(string = tmp_notNumeric,
             pattern = "labjs|location",
             negate = TRUE)

### get survey
vec_ques <- c("PROLIFIC_PID",
              "dummy_informedconsent",
              "commCheck",
              tmp_notNumeric)

vec_notNumeric = c("PROLIFIC_PID", tmp_notNumeric)

questionnaire_preCAM <- questionnairetype(
  dataset = dat_preCAM,
  listvars = vec_ques,
  notNumeric = vec_notNumeric,
  verbose = FALSE
)

dim(questionnaire_preCAM)
[1] 193  17
# > post CAM
tmp_numeric <-
  str_subset(string = colnames(dat_postCAM), pattern = "^nfc")


vec_ques <- c(
  "PROLIFIC_PID",
  "commCheck",
  "feedCAM_repres",
  "feedCAM_technicalprobs",
  "feedCAM_technicalprobsText",
  "feedCAM_already",
  "feedCAM_alreadyText",
  "openQuestion_MaterialSystem",
  "openQuestion_Ethic",
  "openQuestion_missedPositive",
  "openQuestion_missedNegative",
  "outcome_research",
  "outcome_prohibition",
  "outcome_buy",
  "outcome_buy_text",
  "education",
  "experienceMS",
  tmp_numeric,
  "feedback_critic"
)



vec_notNumeric = c(
  "PROLIFIC_PID",
  "feedCAM_technicalprobsText",
  "feedCAM_alreadyText",
  "openQuestion_MaterialSystem",
  "openQuestion_Ethic",
  "openQuestion_missedPositive",
  "openQuestion_missedNegative",
  "outcome_research",
  "outcome_prohibition",
  "outcome_buy",
  "outcome_buy_text",
  "education",
  "experienceMS",
  "feedback_critic"
)

questionnaire_postCAM <- questionnairetype(
  dataset = dat_postCAM,
  listvars = vec_ques,
  notNumeric = vec_notNumeric,
  verbose = FALSE
)

questionnaire_postCAM$feedback_critic[questionnaire_postCAM$feedback_critic == ""] <-
  NA
dim(questionnaire_postCAM)
[1] 192  23
questionnaire_postCAM$multipleEthic <- NA
tmp_multipleEthic <-
  cbind(dat_postCAM$ID, dat_postCAM$multipleEthic)

questionnaire_postCAM$multiplePositive <- NA
tmp_multiplePositive <-
  cbind(dat_postCAM$ID, dat_postCAM$multiplePositive)

questionnaire_postCAM$multipleNegative <- NA
tmp_multipleNegative <-
  cbind(dat_postCAM$ID, dat_postCAM$multipleNegative)


for (i in questionnaire_postCAM$ID) {
  # add multipleEthic
  tmp <- tmp_multipleEthic[tmp_multipleEthic[, 1] == i,]
  if (length(unlist(tmp[, 2])) == 3) {
    questionnaire_postCAM$multipleEthic[questionnaire_postCAM$ID == i] <-
      paste0(unlist(tmp[, 2]), collapse = " \\ ")
  }

  # add multiplePositive
  tmp <- tmp_multiplePositive[tmp_multiplePositive[, 1] == i,]
  if (length(unlist(tmp[, 2])) == 3) {
    questionnaire_postCAM$multiplePositive[questionnaire_postCAM$ID == i] <-
      paste0(unlist(tmp[, 2]), collapse = " \\ ")
  }

  # add multipleNegative
  tmp <- tmp_multipleNegative[tmp_multipleNegative[, 1] == i,]
  if (length(unlist(tmp[, 2])) == 3) {
    questionnaire_postCAM$multipleNegative[questionnaire_postCAM$ID == i] <-
      paste0(unlist(tmp[, 2]), collapse = " \\ ")
  }
}


dim(questionnaire_postCAM)
[1] 192  26
### merge all data sets
# remove one missing
questionnaire_preCAM <-
  questionnaire_preCAM[questionnaire_preCAM$PROLIFIC_PID %in% questionnaire_postCAM$PROLIFIC_PID,]


questionnaire_postCAM$ID <- NULL
questionnaire <-
  left_join(questionnaire_preCAM, questionnaire_postCAM, by = "PROLIFIC_PID")


## all missing answers to NA
questionnaire[questionnaire == ""] <- NA
## all feedback smaller than 3 characters to NA
questionnaire$feedback_critic[nchar(questionnaire$feedback_critic) <= 2] <-
  NA

dim(questionnaire)
[1] 192  41
### save files
## save as .xlsx file
xlsx::write.xlsx2(x = questionnaire, file = "questionnaire.xlsx")
## save as R object
saveRDS(questionnaire, file = "questionnaire.rds")

5 get reaction times for single components

Plot time taken (in minutes) by participants for single components of study:

dat_duration <-
  data.frame(
    duration = NA,
    sender = NA,
    ID = NA,
    PROLIFIC_PID = NA
  )



for (i in 1:length(unique(dat_preCAM$ID))) {
  tmp_PID <-
    dat_preCAM$PROLIFIC_PID[dat_preCAM$ID ==  unique(dat_preCAM$ID)[i] &
                              !is.na(dat_preCAM$PROLIFIC_PID)]



  # pre CAM
  tmp_preCAM <-
    data.frame(duration = dat_preCAM$duration[dat_preCAM$ID == unique(dat_preCAM$ID)[i]] / 1000,
               sender = dat_preCAM$sender[dat_preCAM$ID == unique(dat_preCAM$ID)[i]])
  tmp_preCAM <- tmp_preCAM[!is.na(tmp_preCAM$sender), ]

  # post CAM
  tmp_postCAM <-
    data.frame(duration = dat_postCAM$duration[dat_postCAM$ID == unique(dat_postCAM$ID)[i]] / 1000,
               sender = dat_postCAM$sender[dat_postCAM$ID == unique(dat_postCAM$ID)[i]])
  tmp_postCAM <- tmp_postCAM[!is.na(tmp_postCAM$sender), ]


  tmp <- rbind(tmp_preCAM, tmp_postCAM)


  if (all(is.na(dat_duration))) {
    dat_duration <- data.frame(
      duration = tmp$duration,
      sender = tmp$sender,
      ID = rep(i, times = nrow(tmp)),
      PROLIFIC_PID = rep(tmp_PID, times = nrow(tmp))
    )


  } else{
    dat_duration <-
      rbind(
        dat_duration,
        data.frame(
          duration = tmp$duration,
          sender = tmp$sender,
          ID = rep(i, times =
                     nrow(tmp)),
          PROLIFIC_PID = rep(tmp_PID, times =
                               nrow(tmp))
        )
      )
  }
}

## remove empty sender
dat_duration <- dat_duration[!is.na(dat_duration$sender),]
dat_duration <- dat_duration[!is.na(dat_duration$duration),]

dat_duration$sender[dat_duration$sender == "done"] <-
  "CAM instructions"

## save as .xlsx
# write.xlsx2(x = dat_duration, file = "outputs/para_duration_singleComponents.xlsx")

#### plot
dat_duration$ID <- factor(dat_duration$ID)
p <- dat_duration %>%
  ggplot(aes(x = sender, y = duration, color = PROLIFIC_PID)) +
  geom_point() +
  geom_jitter(width = 0.15) +
  theme(axis.text.x = element_text(angle = 90)) + theme(legend.position =
                                                          "none")
p

## save ggplot as PDF
ggsave(filename = "outputs/01_dataPreperation/durations_components.pdf", p)


# Calculate the mean duration in seconds for each sender and sort by mean duration
tmp <- dat_duration %>%
  group_by(sender) %>%
  summarise(N = n(),
            mean_duration = mean(duration, na.rm = TRUE)) %>%
  arrange(desc(mean_duration))
DT::datatable(tmp, options = list(pageLength = 5))

6 add socio-demographic data

### add prolific data
prolific <- prolific[prolific$Participant.id %in% questionnaire$PROLIFIC_PID,]
prolific <- prolific %>%
  arrange(sapply(Participant.id, function(y) which(y == questionnaire$PROLIFIC_PID)))




if(all(prolific$Participant.id == questionnaire$PROLIFIC_PID)){
  print("prolific data sucessfully added")

  questionnaire$socio_age <- prolific$Age
  questionnaire$socio_sex <- prolific$Sex
  questionnaire$socio_ethnicity <- prolific$Ethnicity.simplified
  questionnaire$socio_student <- prolific$Student.status
  questionnaire$socio_employment <- prolific$Employment.status
  questionnaire$total_min_prolific <- prolific$Time.taken / 60
  ## all time outs to NA
  # questionnaire$total_min_prolific[questionnaire$total_min_prolific > 1000] <- NA
  ## all expired data to NA
  questionnaire[questionnaire == "DATA_EXPIRED"] <- NA

  questionnaire$socio_age <- as.numeric(questionnaire$socio_age)
}
[1] "prolific data sucessfully added"
## all missing answers to NA
# questionnaire[questionnaire == ""] <- NA
## all feedback smaller than 3 characters to NA
# questionnaire$feedback_critic[nchar(questionnaire$feedback_critic) <= 2] <- NA

6.1 Describe sample

table(questionnaire$socio_sex)

           Female              Male Prefer not to say 
               63               127                 2 
round(x = table(questionnaire$socio_sex) / nrow(questionnaire) * 100, digits = 2)

           Female              Male Prefer not to say 
            32.81             66.15              1.04 
psych::describe(x = questionnaire[, c("socio_age", "total_min_prolific")])
                   vars   n  mean    sd median trimmed   mad   min   max range
socio_age             1 192 30.49  9.12  28.00   29.44  8.90 18.00 71.00 53.00
total_min_prolific    2 192 34.91 13.25  32.48   33.37 10.72 12.17 91.88 79.72
                   skew kurtosis   se
socio_age          1.19     1.91 0.66
total_min_prolific 1.38     2.80 0.96

7 set up CAM data

Load CAM data

setwd("outputs/01_dataPreperation")
suppressMessages(read_file("CAMdata.txt") %>%
  # ... split it into lines ...
  str_split('\n') %>% first() %>%
    discard(function(x) x == '') %>%
    discard(function(x) x == '\r') %>%
  # ... filter empty rows ...ds
  discard(function(x) x == '')) -> dat_CAM_pre

raw_CAM <- list()
for(i in 1:length(dat_CAM_pre)){
  raw_CAM[[i]] <- jsonlite::fromJSON(txt = dat_CAM_pre[[i]])
  # raw_CAM[[i]]$connectors$isBidirectional <- TRUE
}

Create CAM files, draw CAMs and compute network indicators

### create CAM single files (nodes, connectors, merged)
CAMfiles <- create_CAMfiles(datCAM = raw_CAM, reDeleted = TRUE)
Nodes and connectors, which were deleted by participants were removed. 
 # deleted nodes:  0 
 # deleted connectors:  434
nrow(CAMfiles[[1]])
[1] 6369
nrow(CAMfiles[[2]])
[1] 7880
nrow(CAMfiles[[3]])
[1] 15760
# remove testing data sets
CAMfiles[[1]] <- CAMfiles[[1]][CAMfiles[[1]]$participantCAM %in% questionnaire$PROLIFIC_PID,]
CAMfiles[[2]] <- CAMfiles[[2]][CAMfiles[[2]]$participantCAM %in% questionnaire$PROLIFIC_PID,]
CAMfiles[[3]] <- CAMfiles[[3]][CAMfiles[[3]]$participantCAM.x %in% questionnaire$PROLIFIC_PID,]

### draw CAMs
CAMdrawn <- draw_CAM(dat_merged = CAMfiles[[3]],
                     dat_nodes = CAMfiles[[1]],ids_CAMs = "all",
                     plot_CAM = FALSE,
                     useCoordinates = TRUE,
                     relvertexsize = 3,
                     reledgesize = 1)
processing 192 CAMs... 
Warning: `graph.data.frame()` was deprecated in igraph 2.0.0.
ℹ Please use `graph_from_data_frame()` instead.
[1] "== participantCAM in drawnCAM"
i <- 2
edge.attributes(graph = CAMdrawn[[i]])
$width
 [1] 5 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 5 1 1 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1

$weight
 [1] 3 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1

$color
 [1] "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey"
[11] "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey"
[21] "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey"
[31] "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey" "grey"

$lty
 [1] 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[39] 1
igraph::is_directed(graph = CAMdrawn[[i]])
[1] FALSE
plot(CAMdrawn[[i]], edge.arrow.size = .2,
       layout=layout_nicely, vertex.frame.color="black", asp = .5, margin = -0.1,
       vertex.size = 10, vertex.label.cex = .9)

### network indicators
tmp_microIndicator <- V(CAMdrawn[[1]])$label
networkIndicators <- compute_indicatorsCAM(drawn_CAM = CAMdrawn,
                                           micro_degree = tmp_microIndicator,
                                           micro_valence = tmp_microIndicator,
                                           micro_centr_clo = tmp_microIndicator,
                                           micro_transitivity = NULL, # not informative
                                           largestClique = FALSE)
Warning: `graph.density()` was deprecated in igraph 2.0.0.
ℹ Please use `edge_density()` instead.
Warning: The `types1` argument of `assortativity()` is deprecated as of igraph 1.6.0.
ℹ Please use the `values` argument instead.
Warning: `assortativity.degree()` was deprecated in igraph 2.0.0.
ℹ Please use `assortativity_degree()` instead.
### wordlist
CAMwordlist <- create_wordlist(
  dat_nodes =  CAMfiles[[1]],
  dat_merged =  CAMfiles[[3]],
  useSummarized = TRUE,
  order = "frequency",
  splitByValence = FALSE,
  comments = TRUE,
  raterSubsetWords = NULL,
  rater = FALSE
)
[1] "create_wordlist - use raw words"
[1] 0
[1] 6336
[1] "temporarily suffixes are added, because not all words have been summarized"
processing 192 CAMs... 
[1] "== participantCAM in drawnCAM"
DT::datatable(CAMwordlist, options = list(pageLength = 5))

7.1 plot CAMs using igraph package

Just plot first 10 CAMs for an impression

for(i in 1:10){ # length(CAMdrawn)
  plot(CAMdrawn[[i]], edge.arrow.size = .7,
       layout=layout_nicely, vertex.frame.color="black", asp = .5, margin = -0.1,
       vertex.size = 10, vertex.label.cex = .9)
}

7.2 save CAMs as .json files, and as .png (igraph)

if(save_CAMs_as_pictures) {
  raw_CAM_translated <- raw_CAM


  setwd("outputs/01_dataPreperation")

  setwd("savedCAMs")
  setwd("png")
  ### remove all files if there are any
  if (length(list.files()) >= 1) {
    file.remove(list.files())
    cat('\n!
      all former .png files have been deleted')
  }

  ### if no participant ID was provided replace by randomly generated CAM ID
  if (all(CAMfiles[[3]]$participantCAM.x == "noID")) {
    CAMfiles[[3]]$participantCAM.x <- CAMfiles[[3]]$CAM.x
  }

  ### save as .json files, and as .png (igraph)
  ids_CAMs <- unique(CAMfiles[[3]]$participantCAM.x)
  length(ids_CAMs)


  for (i in 1:length(ids_CAMs)) {
    save_graphic(filename = paste0("CAM", ids_CAMs[i])) #  paste0(ids_CAMs[i]))
    CAM_igraph <- CAMdrawn[[c(1:length(CAMdrawn))[names(CAMdrawn) == paste0(unique(CAMfiles[[3]]$participantCAM.x)[i])]]]

    ### translate:
    V(CAM_igraph)$label[match(listBasalAttributes$German.Translation,
                              V(CAM_igraph)$label)] <- listBasalAttributes$Basal.Attribute
    V(CAM_igraph)$label[V(CAM_igraph)$label == "Akzeptanz eines neuen Materialsystems"] <-
      "Acceptance of a new material system"

    plot(
      CAM_igraph,
      edge.arrow.size = .7,
      layout = layout_nicely,
      vertex.frame.color = "black",
      asp = .5,
      margin = -0.1,
      vertex.size = 10,
      vertex.label.cex = .9
    )
    dev.off()
  }

  setwd("../json")
  ### remove all files if there are any
  if (length(list.files()) >= 1) {
    file.remove(list.files())
    cat('\n!
      all former .json files have been deleted')
  }
  for (i in 1:length(raw_CAM_translated)) {
    if (!is_empty(raw_CAM_translated[[i]]$nodes)) {
      if (nrow(raw_CAM_translated[[i]]$nodes) > 5) {
        ### translate:
        raw_CAM_translated[[i]]$nodes$text[match(listBasalAttributes$German.Translation,
                                                 raw_CAM_translated[[i]]$nodes$text)] <-
          listBasalAttributes$Basal.Attribute
        raw_CAM_translated[[i]]$nodes$text[raw_CAM_translated[[i]]$nodes$text == "Akzeptanz eines neuen Materialsystems"] <-
          "Acceptance of a new material system"

        if (raw_CAM_translated[[i]]$creator %in% questionnaire$PROLIFIC_PID) {
          # only if not deleted previously
          write(
            toJSON(raw_CAM_translated[[i]], encoding = "UTF-8"),
            paste0(raw_CAM_translated[[i]]$creator, ".json")
          )
        }
      }
    }
  }
}

!
      all former .png files have been deleted

!
      all former .json files have been deleted

7.3 delete CAMs with star topology

### 10% of participants have connected all basal attributes to the central one
barplot(
  table(
    networkIndicators$degreetot_micro_AkzeptanzeinesneuenMaterialsystems
  ) / nrow(networkIndicators)
)

### delete extreme CAMs without any inter-connections between basal attributes
vec_delete <- c()
h = 1


for (i in 1:nrow(networkIndicators)) {
  if (networkIndicators$degreetot_micro_AkzeptanzeinesneuenMaterialsystems[i] >= 30) {
    # & networkIndicators$transitivity_macro[i] == 0
    vec_delete[h] <- names(CAMdrawn)[i]
    h = h + 1

    if(plot_deleted_CAMs){
          plot(
      CAMdrawn[[i]],
      edge.arrow.size = .7,
      layout = layout_nicely,
      vertex.frame.color = "black",
      asp = .5,
      margin = -0.1,
      vertex.size = 10,
      vertex.label.cex = .9,
      main = networkIndicators$participantCAM[i]
    )
    }
  }
}

if (length(vec_delete) > 1) {
  cat(length(vec_delete), "CAMs are removed from the data set, which results in",
      round(x = (1-(length(CAMdrawn) - length(vec_delete)) / length(CAMdrawn)) * 100, digits = 2),
      "% loss of data")


  ## remove deleted CAMs from
  #> CAM files
  CAMfiles_clean <- CAMfiles
  CAMfiles_clean[[1]] <-
    CAMfiles_clean[[1]][!CAMfiles_clean[[1]]$participantCAM %in% vec_delete,]
  CAMfiles_clean[[2]] <-
    CAMfiles_clean[[2]][!CAMfiles_clean[[2]]$participantCAM %in% vec_delete,]
  CAMfiles_clean[[3]] <-
    CAMfiles_clean[[3]][!CAMfiles_clean[[3]]$participantCAM.x %in% vec_delete,]

  #> drawn CAMs
  CAMdrawn_clean <- CAMdrawn[!names(CAMdrawn) %in% vec_delete]

  #> network indicators
  networkIndicators_clean <-
    networkIndicators[!networkIndicators$participantCAM %in% vec_delete,]


  ### 10% of participants have connected all basal attributes to the central one
barplot(
  table(
    networkIndicators_clean$degreetot_micro_AkzeptanzeinesneuenMaterialsystems
  ) / nrow(networkIndicators_clean)
)

  #> re-create wordlist
  CAMwordlist_clean <- create_wordlist(
    dat_nodes =  CAMfiles_clean[[1]],
    dat_merged =  CAMfiles_clean[[3]],
    useSummarized = TRUE,
    order = "frequency",
    splitByValence = FALSE,
    comments = TRUE,
    raterSubsetWords = NULL,
    rater = FALSE
  )

    #> questionnaire
  questionnaire_clean <-
    questionnaire[!questionnaire$PROLIFIC_PID %in% vec_delete,]

}
23 CAMs are removed from the data set, which results in 11.98 % loss of data

[1] "create_wordlist - use raw words"
[1] 0
[1] 5577
[1] "temporarily suffixes are added, because not all words have been summarized"
processing 169 CAMs... 
[1] "== participantCAM in drawnCAM"

7.4 aggregate CAMs

this is needed for MultilayerExtraction.

sel_ids <- unique(CAMfiles_clean[[1]]$participantCAM)
CAMaggregated_clean <-
  aggregate_CAMs(dat_merged = CAMfiles_clean[[3]],
                 dat_nodes = CAMfiles_clean[[1]],
                 ids_CAMs = sel_ids)
[1] "aggregate_CAMs: using participant CAM ids"
processing 169 CAMs... 
[1] "== participantCAM in drawnCAM"
CAMaggregated_clean[[1]][1:5, 1:5]
                                           Akzeptanz eines neuen Materialsystems
Akzeptanz eines neuen Materialsystems                                        169
Energie   speichernd                                                          27
wartungsintensiv                                                              63
aktive Formänderung durch Umwelteinwirkung                                    44
energieeffizient                                                              45
                                           Energie   speichernd
Akzeptanz eines neuen Materialsystems                        27
Energie   speichernd                                        169
wartungsintensiv                                              1
aktive Formänderung durch Umwelteinwirkung                    1
energieeffizient                                             59
                                           wartungsintensiv
Akzeptanz eines neuen Materialsystems                    63
Energie   speichernd                                      1
wartungsintensiv                                        169
aktive Formänderung durch Umwelteinwirkung                4
energieeffizient                                          4
                                           aktive Formänderung durch Umwelteinwirkung
Akzeptanz eines neuen Materialsystems                                              44
Energie   speichernd                                                                1
wartungsintensiv                                                                    4
aktive Formänderung durch Umwelteinwirkung                                        169
energieeffizient                                                                    1
                                           energieeffizient
Akzeptanz eines neuen Materialsystems                    45
Energie   speichernd                                     59
wartungsintensiv                                          4
aktive Formänderung durch Umwelteinwirkung                1
energieeffizient                                        169
## check for symmetry
all(colSums(CAMaggregated_clean[[5]][[1]]) == rowSums(CAMaggregated_clean[[5]][[1]]))
[1] TRUE
## test first concept of first CAM
colnames(CAMaggregated_clean[[5]][[1]])[1]
[1] "Akzeptanz eines neuen Materialsystems"
colnames(CAMaggregated_clean[[5]][[1]])[CAMaggregated_clean[[5]][[1]][1, ] >= 1]
[1] "nachhaltig"            "intelligent"           "lebensähnlich"        
[4] "Energie   generierend" "technologisch"        
rownames(CAMaggregated_clean[[5]][[1]])[CAMaggregated_clean[[5]][[1]][1, ] >= 1]
[1] "nachhaltig"            "intelligent"           "lebensähnlich"        
[4] "Energie   generierend" "technologisch"        
plot(
  make_ego_graph(CAMdrawn_clean[[1]], order = 1, 1)[[1]],
  edge.arrow.size = .7,
  layout = layout_nicely,
  vertex.frame.color = "black",
  asp = .5,
  margin = -0.1,
  vertex.size = 10,
  vertex.label.cex = .9,
  main = networkIndicators$CAM_ID[i]
)

## adjacency matrix (Aij = 1 when nodes i and j are connected and Aij = 0 otherwise)
CAMaggregated_clean_onlyOnes <- CAMaggregated_clean[[5]]
for(i in 1:length(CAMaggregated_clean_onlyOnes)){
  CAMaggregated_clean_onlyOnes[[i]][CAMaggregated_clean_onlyOnes[[i]] >= 1] <- 1
}

8 save all final files

setwd("outputs/01_dataPreperation/final")

if (length(vec_delete) > 1) {
  if (all(questionnaire_clean$PROLIFIC_PID == networkIndicators_clean$participantCAM) &
      all(questionnaire_clean$PROLIFIC_PID == unique(CAMfiles_clean[[1]]$participantCAM))) {
    # save questionnaire
    ## save as .xlsx file
    xlsx::write.xlsx2(x = questionnaire_clean, file = "questionnaire.xlsx")
    ## save as R object
    saveRDS(questionnaire_clean, file = "questionnaire.rds")

    # save network indicators combined and clean
    ## save as .xlsx file
    xlsx::write.xlsx2(x = networkIndicators_clean, file = "networkIndicators.xlsx")
    ## save as R object
    saveRDS(networkIndicators_clean, file = "networkIndicators.rds")

    # save CAMfiles
    ## save as R object
    saveRDS(CAMfiles_clean, file = "CAMfiles.rds")

    # save drawn CAMs
    ## save as R object
    saveRDS(CAMdrawn_clean, file = "CAMdrawn.rds")

    # save wordlist
    ## save as .xlsx file
    xlsx::write.xlsx2(x = CAMwordlist_clean, file = "CAMwordlist.xlsx")

    # save aggregated CAMs
    ## save as R object
    saveRDS(CAMaggregated_clean, file = "CAMaggregated.rds")

    # save adjacency matrix of aggregated CAM as CSV
    write.table(CAMaggregated_clean[[1]], file = "CAMaggregated_adj_matrix.csv",
              row.names = FALSE, col.names = TRUE)

    # export list of single-layer adjacency matrices to MATLAB for use with GenLouvain
    multigraph_adj_matrices_list <- CAMaggregated_clean[[5]]

    # R.matlab allows names with only <32 characters
    names(multigraph_adj_matrices_list) <- substr(names(multigraph_adj_matrices_list), start = 1, stop = 30)
    writeMat(con = "CAMaggregated_adj_matrices.mat", multigraph_adj_matrices_list = multigraph_adj_matrices_list)
    multigraph_adj_matrices_list <- CAMaggregated_clean_onlyOnes

    # R.matlab allows names with only <32 characters
    names(multigraph_adj_matrices_list) <- substr(names(multigraph_adj_matrices_list), start = 1, stop = 30)
    writeMat(con = "CAMaggregated_adj_matrices_onlyOnes.mat", multigraph_adj_matrices_list = multigraph_adj_matrices_list)

    cat("clean data sets have been sucessfully saved")
  }
}
clean data sets have been sucessfully saved