Data Preparation Climate Law Study

Author

Julius Fenn, Nina Frings

Background Information

This is an R Markdown document. Instructions for writing these documents and background information can be found in the book written by Xie, Allaire, and Grolemund (2018). When you execute code within the document, the results appear beneath the code.

The file includes all analysis code for the published article XXX. In the article, a multi-method approach was applied, whereby two modes of data collection - Cognitive-Affective Maps (CAM) and surveys - were combined with two statistical procedures - graph theory (e.g., Newman 2018) and Latent Variable Models (e.g., Skrondal and Rabe-Hesketh 2004).

This file contains the pre-processing step (clean, transform data), whereas the folder “dataAnalysis” contains the analysis step (test hypotheses and exploratory analyses); together they follow the classical data-analysis pipeline (see Peng and Matsui 2016; Wickham and Grolemund 2017).

Notes

Remark:

### global settings that steer the cleaning steps

## cleaning of the CAM data
# drop every CAM in which not a single word can be found in a German dictionary
global_removeFakeCAMs <- TRUE # ! must stay TRUE
# store the CAMs identified as fake as an R object
global_saveFakeCAMs <- TRUE # TRUE / FALSE


## cleaning of participants
# numeric threshold: how often an IP may occur in the data set before the
# participants sharing it are removed
global_removeIPs_num <- 3
# drop participants that Qualtrics flagged as SPAM
global_removeSpam <- TRUE # TRUE / FALSE
# drop participants with repeated IPs (threshold: global_removeIPs_num)
global_removeIPs <- TRUE # TRUE / FALSE

load raw data

Load R packages, functions:

# sets the directory of location of this script as the current directory
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))

### load packages
require(pacman)
p_load('tidyverse', 'jsonlite', 'magrittr', 'xlsx',
       'stargazer', 'psych', 'jtools', 'DT', 'ggstatsplot', 
       'lavaan', 'regsem', 'MplusAutomation', 'igraph', 'mapview')


### load functions
# Source every file of the two function directories that sit next to the
# directory this document is run from. The directories are addressed by path
# instead of being entered with setwd(), so the working directory of the
# session is left unchanged and the relative "data/..." paths used further
# down resolve the same way whether the document is knitted or run line by line.
for (function_dir in c("../functions", "../functions_CAMapp")) {
  # list.files() silently returns nothing for a missing directory - fail loudly
  # instead of continuing without the functions
  if (!dir.exists(function_dir)) {
    stop("directory with functions not found: ", function_dir, call. = FALSE)
  }

  # full paths, alphabetical order; an empty directory gives character(0) and
  # nothing is sourced
  function_files <- list.files(path = function_dir, full.names = TRUE)
  # print(function_files)

  # chdir = TRUE: each file is evaluated from inside its own directory and the
  # previous working directory is restored by source() afterwards
  invisible(lapply(function_files, source, encoding = "utf-8", chdir = TRUE))
}
rm(function_dir, function_files)

Load raw data:

Remark: The survey data was collected using the Qualtrics-Software and the CAM data was collected using the C.A.M.E.L. software (Fenn et al. 2023), hosted on a JATOS-server.

### survey data
# Reads one survey CSV export and discards the first two rows of the resulting
# data frame (presumably the additional Qualtrics header rows - confirm
# against the raw files).
read_survey_export <- function(path) {
  raw_export <- read.csv(file = path, encoding = "UTF-8")
  raw_export[-c(1, 2), ]
}

## t1
survey_p1_t1 <- read_survey_export("data/t1/Klimagesetz.wave1.Teil1.csv")
survey_p2_t1 <- read_survey_export("data/t1/Klimagesetz.wave1.Teil2.csv")

## t2
survey_p1_t2 <- read_survey_export("data/t2/Klimagesetz.wave2.Teil1.csv")
survey_p2_t2 <- read_survey_export("data/t2/Klimagesetz.wave2.Teil2.csv")

## t2 control group
survey_p1_t2_controlGroup <- read_survey_export("data/t2_controlGroup/Klimagesetz.control.Teil1.csv")
survey_p2_t2_controlGroup <- read_survey_export("data/t2_controlGroup/Klimagesetz.control.Teil2.csv")

# the helper is only needed for the import
rm(read_survey_export)


### CAM data
# Reads a text file that holds one JSON document per line and parses every
# non-blank line with jsonlite::fromJSON().
#   path: path to the text file
#   returns: list with one parsed object per non-blank line (an empty list if
#            the file contains no such line)
read_cam_json_lines <- function(path) {
  json_lines <- read_file(path) %>%
    # ... split it into lines ...
    str_split('\n') %>%
    first()

  # ... filter empty rows; trimws() additionally skips lines that hold only
  # whitespace (e.g. a lone carriage return in files with Windows line
  # endings), which could not be parsed as JSON anyway
  json_lines <- json_lines[nzchar(trimws(json_lines))]

  # lapply() copes with zero lines, whereas a 1:length() loop would not
  lapply(json_lines, function(json_line) jsonlite::fromJSON(txt = json_line))
}

## t1
dfCAM_t1_list <- read_cam_json_lines('data/t1/t1_climateLawStudy.txt')

## t2
dfCAM_t2_list <- read_cam_json_lines('data/t2/t2_climateLawStudy.txt')

## data at t2 - cross-sectional (control group)
dfCAM_t2_controlGroup_list <- read_cam_json_lines('data/t2_controlGroup/t2_climateLawStudy_controlGroup.txt')


# the helper is only needed for the import
rm(read_cam_json_lines)


### id data set to merge t1 and t2
# keep only the two id columns (wave 1 / wave 2) and drop duplicated rows
df_IDs <- unique(
  read.csv(file = "data/Match_participantID.csv", encoding = "UTF-8", sep = ";")[
    , c("participantID.wave1", "participantID.wave2")
  ]
)

prepare data

survey data

Consistent missing values and naming of variables

### all missing values to NA
# returns the survey with every cell equal to the empty string set to NA
blank_to_na <- function(survey) {
  survey[survey == ""] <- NA
  survey
}

## t1
survey_p1_t1 <- blank_to_na(survey_p1_t1)
survey_p2_t1 <- blank_to_na(survey_p2_t1)
## t2
survey_p1_t2 <- blank_to_na(survey_p1_t2)
survey_p2_t2 <- blank_to_na(survey_p2_t2)
## t2 control group
survey_p1_t2_controlGroup <- blank_to_na(survey_p1_t2_controlGroup)
survey_p2_t2_controlGroup <- blank_to_na(survey_p2_t2_controlGroup)

rm(blank_to_na)


### rename not consistently named variables
# in the part 1 files the participant id is stored in the column "u"
names(survey_p1_t1)[names(survey_p1_t1) == "u"] <- "participantID"

names(survey_p1_t2)[names(survey_p1_t2) == "u"] <- "participantID"

# the control group file additionally spells the attention check with "_"
names(survey_p1_t2_controlGroup)[names(survey_p1_t2_controlGroup) == "attention_check"] <- "attention.check"
names(survey_p1_t2_controlGroup)[names(survey_p1_t2_controlGroup) == "u"] <- "participantID"


### rename not consistently named variable labels
# Finished: the t2 / control group files use the German label "Wahr";
# "Wahr" -> TRUE, any other text -> FALSE, missing values stay NA
survey_p1_t2$Finished <- survey_p1_t2$Finished == "Wahr"
survey_p2_t2$Finished <- survey_p2_t2$Finished == "Wahr"

survey_p1_t2_controlGroup$Finished <- survey_p1_t2_controlGroup$Finished == "Wahr"
survey_p2_t2_controlGroup$Finished <- survey_p2_t2_controlGroup$Finished == "Wahr"

# education
# the answer "3" at t1 is treated as missing (presumably not a valid label - confirm)
survey_p1_t1$education[survey_p1_t1$education %in% "3"] <- NA
# capitalise "kein Abschluss" so that it matches the label used for the factor levels
survey_p1_t2_controlGroup$education[survey_p1_t2_controlGroup$education %in% "kein Abschluss"] <- "Kein Abschluss"

### remove all variables not needed (mainly internal Qualtrics variables)
qualtrics_internal <- c("RecordedDate", "ResponseId", "RecipientLastName", "RecipientFirstName", "RecipientEmail", "ExternalReference", "DistributionChannel", "UserLanguage")

# returns the survey without the columns named in "unwanted"
drop_columns <- function(survey, unwanted) {
  survey[, !(names(survey) %in% unwanted)]
}

## t1
survey_p1_t1 <- drop_columns(survey_p1_t1, qualtrics_internal)
survey_p2_t1 <- drop_columns(survey_p2_t1, c(qualtrics_internal, "u")) # u = NA

## t2
survey_p1_t2 <- drop_columns(survey_p1_t2, qualtrics_internal)
survey_p2_t2 <- drop_columns(survey_p2_t2, c(qualtrics_internal, "u")) # u = NA

## t2 control group
survey_p1_t2_controlGroup <- drop_columns(survey_p1_t2_controlGroup, qualtrics_internal)
survey_p2_t2_controlGroup <- drop_columns(survey_p2_t2_controlGroup, c(qualtrics_internal, "u")) # u = NA

rm(qualtrics_internal, drop_columns)

Set variable types:

### everything is a string - set variable types
## t1 START ##
#> part 1

# str(survey_p1_t1)
# columns that hold numbers but were read in as text
for (numeric_column in c("Progress", "Duration..in.seconds.", "LocationLongitude", "LocationLatitude", "age")) {
  survey_p1_t1[[numeric_column]] <- as.numeric(survey_p1_t1[[numeric_column]])
}
rm(numeric_column)

names(survey_p1_t1)[names(survey_p1_t1) == "Duration..in.seconds."] <- "durationSeconds"

# "True" -> TRUE, any other text -> FALSE, missing values stay NA
survey_p1_t1$Finished <- survey_p1_t1$Finished == "True"

# to dummy variable: 1 for the answer "Ja1,Ja2", 0 for any other answer
survey_p1_t1$consent <- as.numeric(survey_p1_t1$consent == "Ja1,Ja2")

survey_p1_t1$gender <- as.factor(x = survey_p1_t1$gender)

# ordered factor; answers that are not one of the levels become NA
education_levels <- c("Kein Abschluss", "Obligatorische Schule", "Hochschulreife / Matur", "Bachelor/ Master/ Promotion (Dr. / PhD)")
survey_p1_t1$education <- factor(x = survey_p1_t1$education, levels = education_levels, ordered = TRUE)
rm(education_levels)


#> part 2
# str(survey_p2_t1)

# columns that hold numbers but were read in as text
for (numeric_column in c("Progress", "Duration..in.seconds.", "LocationLongitude", "LocationLatitude",
                         "rating_positive_1", "rating_favor_1", "politicalorientation_1")) {
  survey_p2_t1[[numeric_column]] <- as.numeric(survey_p2_t1[[numeric_column]])
}

# "True" -> TRUE, any other text -> FALSE, missing values stay NA
survey_p2_t1$Finished <- survey_p2_t1$Finished == "True"

# vote.co2 to dummy variable: answers containing "für ja" -> 1, "für nein" -> 0,
# "keine Angabe" -> NA; the replacements are applied in this order
vote_co2 <- survey_p2_t1$vote.co2
vote_co2[str_detect(string = vote_co2, pattern = "für ja")] <- 1
vote_co2[str_detect(string = vote_co2, pattern = "für nein")] <- 0
vote_co2[str_detect(string = vote_co2, pattern = "keine Angabe")] <- NA
survey_p2_t1$vote.co2 <- as.numeric(vote_co2)

# to dummy variable: 1 for the answer "Ja", 0 for any other answer
survey_p2_t1$intended.vote <- as.numeric(survey_p2_t1$intended.vote == "Ja")

# ordered factor; answers that are not one of the levels become NA
representative_levels <- c("sehr unrepräsentativ",
                           "etwas unrepräsentativ",
                           "ein bisschen unrepräsentativ",
                           "weder unrepräsentativ noch repräsentativ",
                           "ein bisschen repräsentativ",
                           "eher repräsentativ",
                           "sehr repräsentativ")
survey_p2_t1$Q48 <- factor(x = survey_p2_t1$Q48, levels = representative_levels, ordered = TRUE)

# give the converted columns their final names
old_names <- c("Duration..in.seconds.", "vote.co2", "intended.vote", "rating_positive_1",
               "rating_favor_1", "politicalorientation_1", "Q48")
new_names <- c("durationSeconds", "voteCo2", "intendedVote", "ratingPositive",
               "ratingFavor", "politicalOrientation", "representative")
for (k in seq_along(old_names)) {
  names(survey_p2_t1)[names(survey_p2_t1) == old_names[k]] <- new_names[k]
}

rm(numeric_column, vote_co2, representative_levels, old_names, new_names, k)
## t1 END ##



## t2 START ##
#> part 1
# str(survey_p1_t2)

# columns that hold numbers but were read in as text
for (numeric_column in c("Progress", "Duration..in.seconds.", "LocationLongitude", "LocationLatitude")) {
  survey_p1_t2[[numeric_column]] <- as.numeric(survey_p1_t2[[numeric_column]])
}
rm(numeric_column)

names(survey_p1_t2)[names(survey_p1_t2) == "Duration..in.seconds."] <- "durationSeconds"

# consent to dummy variable: answers containing the consent sentence become 1;
# any other text cannot be converted by as.numeric() and therefore ends up as NA
consent_sentence <- "Die Daten dürfen für wissenschaftliche Zwecke genutzt und die Ergebnisse der Studie in wissenschaftlichen Zeitschriften oder Büchern veröffentlicht werden"
consent_given <- str_detect(string = survey_p1_t2$consent, pattern = consent_sentence)
survey_p1_t2$consent[consent_given] <- 1
survey_p1_t2$consent <- as.numeric(survey_p1_t2$consent)
rm(consent_sentence, consent_given)


#> part 2

# str(survey_p2_t2)
# columns that hold numbers but were read in as text
for (numeric_column in c("Progress", "Duration..in.seconds.", "LocationLongitude", "LocationLatitude",
                         "rating_positive_1", "rating_favor_1")) {
  survey_p2_t2[[numeric_column]] <- as.numeric(survey_p2_t2[[numeric_column]])
}

# intended vote to dummy variable: answers containing "Nein" -> 0, then answers
# containing "Ja" -> 1
intended_vote <- survey_p2_t2$intended.vote
intended_vote[str_detect(string = intended_vote, pattern = "Nein")] <- 0
intended_vote[str_detect(string = intended_vote, pattern = "Ja")] <- 1
survey_p2_t2$intended.vote <- as.numeric(intended_vote)

survey_p2_t2$party_identification <- as.factor(x = survey_p2_t2$party_identification)


# climate change concern items: ordered factors sharing one agreement scale
agreement_levels <- c("stimme überhaupt nicht zu",
                      "stimme nicht zu",
                      "stimme eher nicht zu",
                      "stimme eher zu",
                      "stimme zu",
                      "stimme voll und ganz zu")
for (item_column in paste0("cc_concerns_", 1:4)) {
  survey_p2_t2[[item_column]] <- factor(x = survey_p2_t2[[item_column]], levels = agreement_levels, ordered = TRUE)
}


# engagement items: ordered factors sharing one scale of hours in the last month
engagement_levels <- c("Gar nicht",
                       "Für circa 1-2 Stunden im letzten Monat",
                       "Für circa 2-3 Stunden im letzten Monat",
                       "Für circa 3-4 Stunden im letzten Monat",
                       "Für circa 4-5 Stunden im letzten Monat",
                       "Für circa 5-6 Stunden im letzten Monat",
                       "Für circa 6-7 Stunden im letzten Monat",
                       "Für mehr als 7 Stunden im letzten Monat")
for (item_column in paste0("engagement_", 1:5)) {
  survey_p2_t2[[item_column]] <- factor(x = survey_p2_t2[[item_column]], levels = engagement_levels, ordered = TRUE)
}


# ordered factor; answers that are not one of the levels become NA
representative_levels <- c("sehr unrepräsentativ",
                           "etwas unrepräsentativ",
                           "ein bisschen unrepräsentativ",
                           "weder unrepräsentativ noch repräsentativ",
                           "ein bisschen repräsentativ",
                           "eher repräsentativ",
                           "sehr repräsentativ")
survey_p2_t2$repräsentativ <- factor(x = survey_p2_t2$repräsentativ, levels = representative_levels, ordered = TRUE)

# give the columns their final names
old_names <- c("Duration..in.seconds.", "intended.vote", "rating_positive_1", "rating_favor_1",
               "party_identification", "party_identification_7_TEXT",
               "media_6_TEXT", "media_7_TEXT", "repräsentativ")
new_names <- c("durationSeconds", "intendedVote", "ratingPositive", "ratingFavor",
               "partyIdentification", "partyIdentification_text",
               "media1_text", "media2_text", "representative")
for (k in seq_along(old_names)) {
  names(survey_p2_t2)[names(survey_p2_t2) == old_names[k]] <- new_names[k]
}

rm(numeric_column, intended_vote, agreement_levels, engagement_levels, representative_levels,
   item_column, old_names, new_names, k)
## t2 END ##


## t2 control group START ##
#> part 1

# str(survey_p1_t2_controlGroup)
# columns that hold numbers but were read in as text
for (numeric_column in c("Progress", "Duration..in.seconds.", "age")) {
  survey_p1_t2_controlGroup[[numeric_column]] <- as.numeric(survey_p1_t2_controlGroup[[numeric_column]])
}
rm(numeric_column)

names(survey_p1_t2_controlGroup)[names(survey_p1_t2_controlGroup) == "Duration..in.seconds."] <- "durationSeconds"

# unique(survey_p1_t2_controlGroup$consent)
### You could only continue if you gave consent, so the data set only contains
### people who gave their consent - the dummy is set to 1 for everyone
survey_p1_t2_controlGroup$consent <- 1

survey_p1_t2_controlGroup$gender <- as.factor(x = survey_p1_t2_controlGroup$gender)

# ordered factor; answers that are not one of the levels become NA
education_levels <- c("Kein Abschluss", "Obligatorische Schule", "Hochschulreife / Matur", "Bachelor/ Master/ Promotion (Dr. / PhD)")
survey_p1_t2_controlGroup$education <- factor(x = survey_p1_t2_controlGroup$education, levels = education_levels, ordered = TRUE)
rm(education_levels)

#> part 2
# str(survey_p2_t2_controlGroup)

# columns that hold numbers but were read in as text
for (numeric_column in c("Progress", "Duration..in.seconds.", "rating_positive_1",
                         "rating_favor_1", "politicalorientation_1")) {
  survey_p2_t2_controlGroup[[numeric_column]] <- as.numeric(survey_p2_t2_controlGroup[[numeric_column]])
}

# intended vote to dummy variable: answers containing "Nein" -> 0, then answers
# containing "Ja" -> 1
intended_vote <- survey_p2_t2_controlGroup$intended_vote
intended_vote[str_detect(string = intended_vote, pattern = "Nein")] <- 0
intended_vote[str_detect(string = intended_vote, pattern = "Ja")] <- 1
survey_p2_t2_controlGroup$intended_vote <- as.numeric(intended_vote)

survey_p2_t2_controlGroup$party_identification <- as.factor(x = survey_p2_t2_controlGroup$party_identification)


# climate change concern items: ordered factors sharing one agreement scale
agreement_levels <- c("stimme überhaupt nicht zu",
                      "stimme nicht zu",
                      "stimme eher nicht zu",
                      "stimme eher zu",
                      "stimme zu",
                      "stimme voll und ganz zu")
for (item_column in paste0("cc_concerns_", 1:4)) {
  survey_p2_t2_controlGroup[[item_column]] <- factor(x = survey_p2_t2_controlGroup[[item_column]], levels = agreement_levels, ordered = TRUE)
}


# engagement items: ordered factors sharing one scale of hours in the last month
engagement_levels <- c("Gar nicht",
                       "Für circa 1-2 Stunden im letzten Monat",
                       "Für circa 2-3 Stunden im letzten Monat",
                       "Für circa 3-4 Stunden im letzten Monat",
                       "Für circa 4-5 Stunden im letzten Monat",
                       "Für circa 5-6 Stunden im letzten Monat",
                       "Für circa 6-7 Stunden im letzten Monat",
                       "Für mehr als 7 Stunden im letzten Monat")
for (item_column in paste0("engagement_", 1:5)) {
  survey_p2_t2_controlGroup[[item_column]] <- factor(x = survey_p2_t2_controlGroup[[item_column]], levels = engagement_levels, ordered = TRUE)
}


# ordered factor; answers that are not one of the levels become NA
representative_levels <- c("sehr unrepräsentativ",
                           "etwas unrepräsentativ",
                           "ein bisschen unrepräsentativ",
                           "weder unrepräsentativ noch repräsentativ",
                           "ein bisschen repräsentativ",
                           "eher repräsentativ",
                           "sehr repräsentativ")
survey_p2_t2_controlGroup$repräsentativ <- factor(x = survey_p2_t2_controlGroup$repräsentativ, levels = representative_levels, ordered = TRUE)

# give the columns their final names
old_names <- c("Duration..in.seconds.", "intended_vote", "rating_positive_1", "rating_favor_1",
               "politicalorientation_1", "party_identification", "party_identification_7_TEXT",
               "media_6_TEXT", "media_7_TEXT", "repräsentativ")
new_names <- c("durationSeconds", "intendedVote", "ratingPositive", "ratingFavor",
               "politicalOrientation", "partyIdentification", "partyIdentification_text",
               "media1_text", "media2_text", "representative")
for (k in seq_along(old_names)) {
  names(survey_p2_t2_controlGroup)[names(survey_p2_t2_controlGroup) == old_names[k]] <- new_names[k]
}

rm(numeric_column, intended_vote, agreement_levels, engagement_levels, representative_levels,
   item_column, old_names, new_names, k)
## t2 control group END ##

Remove given test ID and missing IDs:

### drop the rows that belong to the test runs
## t1
# the test ID had already been removed by hand before the study started

## t2
# filter() keeps rows where the condition is TRUE, so rows with a missing
# participantID are dropped here as well
survey_p1_t2 <- filter(survey_p1_t2, participantID != "ddde09e8ac4604983c2e313df3672338")
survey_p2_t2 <- filter(survey_p2_t2, participantID != "ddde09e8ac4604983c2e313df3672338")

## t2 control group
survey_p1_t2_controlGroup <- filter(survey_p1_t2_controlGroup, participantID != "523aa001f490a2e04b04eac38f3bf6f9")
survey_p2_t2_controlGroup <- filter(survey_p2_t2_controlGroup, participantID != "523aa001f490a2e04b04eac38f3bf6f9")

### remove missing IDs
# each sum() reports the number of missing participant IDs; the following line
# keeps only the rows whose participant ID is present
## t1
sum(is.na(survey_p1_t1$participantID))
[1] 14
survey_p1_t1 <- survey_p1_t1[complete.cases(survey_p1_t1$participantID), ]
sum(is.na(survey_p2_t1$participantID))
[1] 0
survey_p2_t1 <- survey_p2_t1[complete.cases(survey_p2_t1$participantID), ]
## t2
sum(is.na(survey_p1_t2$participantID))
[1] 0
survey_p1_t2 <- survey_p1_t2[complete.cases(survey_p1_t2$participantID), ]
sum(is.na(survey_p2_t2$participantID))
[1] 0
survey_p2_t2 <- survey_p2_t2[complete.cases(survey_p2_t2$participantID), ]
## t2 control group
sum(is.na(survey_p1_t2_controlGroup$participantID))
[1] 0
survey_p1_t2_controlGroup <- survey_p1_t2_controlGroup[complete.cases(survey_p1_t2_controlGroup$participantID), ]
sum(is.na(survey_p2_t2_controlGroup$participantID))
[1] 0
survey_p2_t2_controlGroup <- survey_p2_t2_controlGroup[complete.cases(survey_p2_t2_controlGroup$participantID), ]

CAM data

create and draw CAMs and remove missing and test ids:

### create CAM files
# Build the CAM file lists for the three data collections from the raw CAM lists.
# Per the printed messages, nodes and connectors deleted by participants are
# removed (reDeleted = TRUE) and CAMs without any node are reported.
# Later code uses element [[1]] as the node table and element [[3]] as the merged table.
## t1
CAM_files_t1 <- create_CAMfiles(datCAM = dfCAM_t1_list, reDeleted = TRUE, verbose = FALSE)
following CAM containing zero nodes:  
following CAM containing zero nodes:  
following CAM containing zero nodes:  
following CAM containing zero nodes:  c8218c4b-32ac-4dd6-8c85-0c79f23e6847 
following CAM containing zero nodes:  
following CAM containing zero nodes:  e5dc6173-5307-4919-8f49-42ed8eac672e 
following CAM containing zero nodes:  
following CAM containing zero nodes:  183e5908-1dff-4261-a38f-398fe65cb689 
Nodes and connectors, which were deleted by participants were removed. 
 # deleted nodes:  1236 
 # deleted connectors:  185
## t2
CAM_files_t2 <- create_CAMfiles(datCAM = dfCAM_t2_list, reDeleted = TRUE, verbose = FALSE)
following CAM containing zero nodes:  61d280e0-44fb-42a1-bf48-5291651f7dbc 
Nodes and connectors, which were deleted by participants were removed. 
 # deleted nodes:  562 
 # deleted connectors:  92
## t2 control group
CAM_files_t2_controlGroup <- create_CAMfiles(datCAM = dfCAM_t2_controlGroup_list, reDeleted = TRUE, verbose = FALSE)
following CAM containing zero nodes:  
Nodes and connectors, which were deleted by participants were removed. 
 # deleted nodes:  125 
 # deleted connectors:  20
# the raw CAM lists are deliberately kept in memory (the rm() calls stay commented out)
# rm(dfCAM_t1_list)
# rm(dfCAM_t2_list)
# rm(dfCAM_t2_controlGroup_list)

### remove CAMs with no valid ID
# A participant ID is treated as valid when it has exactly 32 characters.
# Each table() lists the IDs that fail this check (e.g. "NO ID PROVIDED", "noID");
# the two assignments then keep only valid IDs in the node table ([[1]]) and in
# the merged table ([[3]], ID column "participantCAM.x").
# NOTE(review): nchar() returns NA for a missing ID, and an NA row index yields
# an all-NA row; confirm that no ID is NA at this point.
## t1
table(CAM_files_t1[[1]]$participantCAM[nchar(CAM_files_t1[[1]]$participantCAM) != 32])

c5c36c24-5225-45c8-9e12-f3ca25deb52d                       NO ID PROVIDED 
                                   1                                  466 
CAM_files_t1[[1]] <- CAM_files_t1[[1]][nchar(CAM_files_t1[[1]]$participantCAM) == 32, ]
CAM_files_t1[[3]] <- CAM_files_t1[[3]][nchar(CAM_files_t1[[3]]$participantCAM.x) == 32, ]
## t2
table(CAM_files_t2[[1]]$participantCAM[nchar(CAM_files_t2[[1]]$participantCAM) != 32])

         11712          25522 NO ID PROVIDED           noID 
             9             10             37              9 
CAM_files_t2[[1]] <- CAM_files_t2[[1]][nchar(CAM_files_t2[[1]]$participantCAM) == 32, ]
CAM_files_t2[[3]] <- CAM_files_t2[[3]][nchar(CAM_files_t2[[3]]$participantCAM.x) == 32, ]
## t2 control group
table(CAM_files_t2_controlGroup[[1]]$participantCAM[nchar(CAM_files_t2_controlGroup[[1]]$participantCAM) != 32])

                        25504          29279          41567 NO ID PROVIDED 
             8              9              9             10            105 
          noID 
            18 
CAM_files_t2_controlGroup[[1]] <- CAM_files_t2_controlGroup[[1]][nchar(CAM_files_t2_controlGroup[[1]]$participantCAM) == 32, ]
CAM_files_t2_controlGroup[[3]] <- CAM_files_t2_controlGroup[[3]][nchar(CAM_files_t2_controlGroup[[3]]$participantCAM.x) == 32, ]


### draw CAMs
# Build the drawn CAMs for all CAMs (ids_CAMs = "all") of each data collection
# from the merged table ([[3]]) and the node table ([[1]]); nothing is plotted
# (plot_CAM = FALSE). The results are later indexed by participant ID via names().
# NOTE(review): presumably each list element is one participant's network; confirm in draw_CAM().
## t1
CAMdrawn_t1 <- draw_CAM(dat_merged = CAM_files_t1[[3]],
                     dat_nodes = CAM_files_t1[[1]],ids_CAMs = "all", plot_CAM = FALSE,
                     useCoordinates = TRUE,
                     relvertexsize = 3,
                     reledgesize = 1)
processing 575 CAMs... 
[1] "== participantCAM in drawnCAM"
## t2
CAMdrawn_t2 <- draw_CAM(dat_merged = CAM_files_t2[[3]],
                     dat_nodes = CAM_files_t2[[1]],ids_CAMs = "all", plot_CAM = FALSE,
                     useCoordinates = TRUE,
                     relvertexsize = 3,
                     reledgesize = 1)
processing 322 CAMs... 
[1] "== participantCAM in drawnCAM"
## t2 control group
CAMdrawn_t2_controlGroup <- draw_CAM(dat_merged = CAM_files_t2_controlGroup[[3]],
                     dat_nodes = CAM_files_t2_controlGroup[[1]],ids_CAMs = "all", plot_CAM = FALSE,
                     useCoordinates = TRUE,
                     relvertexsize = 3,
                     reledgesize = 1)
processing 83 CAMs... 
[1] "== participantCAM in drawnCAM"

clean data

CAM data

identify fake CAMs (only deleted if “global_removeFakeCAMs”=TRUE)

  1. load German dictionary
### load German dictionary
## german
# The word list is read through a relative path instead of setwd(), so the
# working directory is the same whether the chunk is knitted or run interactively.
dict_german <- readr::read_table(
  file = file.path("data dictionaries", "de_frami.txt"),
  col_names = TRUE
)

# keep only entries that contain a "/" and cut off the "/" with everything after it
dict_german <- str_subset(string = dict_german$words, pattern = "/")
dict_german <- str_remove_all(string = dict_german, pattern = "/.*$")
# keep entries that start with at least two alphabetic characters
dict_german <- str_subset(string = dict_german, pattern = "^[[:alpha:]]{2,}")
# add study-specific words, then lower-case, keep words with >= 5 characters, de-duplicate
dict_german <- c(dict_german, "globale", "KOGNITIVE", "Modelle", "Nutzeneffekte", "Faktoren", "Soziale")
dict_german <- tolower(x = dict_german)
dict_german <- dict_german[nchar(x = dict_german) >= 5]
dict_german <- unique(dict_german)


# Further terms are prepended AFTER lower-casing and the length filter, so short
# terms (e.g. "nur", "co2") and the capitalised variants are kept exactly as typed.
# NOTE(review): the capitalised variants only matter if identifyFakeCAMs()
# matches case-sensitively; confirm.
dict_german <- c(
  "nur", "elektro", "teuerung", "klimaschutz", "klima", "co2", "subventionen",
  "Stromfresser", "Landwirtschaft", "veränderungen", "stromfresser",
  "Treibhausgase", "Stromversorgung", "Veränderungen", "Subventionen",
  "Klimaneutralität", "Umweltschutz", "Energiesparen", "erneuerbare", "Strom",
  dict_german
)
dict_german <- unique(dict_german)


## to swiss german
# Swiss spelling: replace every "ß" by "ss"
dict_german <- str_replace_all(string = dict_german, pattern = "ß", replacement = "ss")
  1. check if at least one word within the dictionary can be found in a CAM
### run dictionary approach
## identify fake CAMs using a dictionary
# NOTE(review): identifyFakeCAMs() presumably reads the global `dict_german`
# built in the previous chunk; confirm in the function definition.
CAMdrawn_t1_fake <- identifyFakeCAMs(CAMdrawn = CAMdrawn_t1)
CAMdrawn_t2_fake <- identifyFakeCAMs(CAMdrawn = CAMdrawn_t2)
CAMdrawn_t2_controlGroup_fake <-
  identifyFakeCAMs(CAMdrawn = CAMdrawn_t2_controlGroup)

if (global_saveFakeCAMs) {
  # to check all so identified CAMs manually
  # Files are written through file.path() instead of setwd(), so the working
  # directory is not changed as a side effect.
  saveRDS(CAMdrawn_t1, file = file.path("checkFakeCAMs", "CAMdrawn_t1.rds"))
  saveRDS(CAMdrawn_t2, file = file.path("checkFakeCAMs", "CAMdrawn_t2.rds"))
  saveRDS(CAMdrawn_t2_controlGroup, file = file.path("checkFakeCAMs", "CAMdrawn_t2_controlGroup.rds"))

  saveRDS(CAMdrawn_t1_fake, file = file.path("checkFakeCAMs", "CAMdrawn_t1_fake.rds"))
  saveRDS(CAMdrawn_t2_fake, file = file.path("checkFakeCAMs", "CAMdrawn_t2_fake.rds"))
  saveRDS(CAMdrawn_t2_controlGroup_fake, file = file.path("checkFakeCAMs", "CAMdrawn_t2_controlGroup_fake.rds"))
}

fake CAMs were also visually checked in additional files (see folder “checkFakeCAMs”)

  1. remove all CAMs identified as fake in this way (only deleted if “global_removeFakeCAMs”=TRUE)
### remove identified fake CAMs

# take four t1 IDs off the list of fake CAMs, so that these CAMs are kept
# NOTE(review): presumably these were judged genuine in the manual check; confirm
CAMdrawn_t1_fake$ID_fakeCAMs <- CAMdrawn_t1_fake$ID_fakeCAMs[
  is.na(match(
    CAMdrawn_t1_fake$ID_fakeCAMs,
    c(
      "18bb2e604239a661fe6e620fb3c19bfd",
      "674e92e1ad74d74b5eae470bd364556d",
      "90504081eb1ac2bf8761805ea2511948",
      "60ac1c4f664ed5acd4abde2641d83858"
    )
  ))
]


length(CAMdrawn_t1_fake$ID_fakeCAMs) / length(CAMdrawn_t1)
[1] 0.2104348
length(CAMdrawn_t2_fake$ID_fakeCAMs) / length(CAMdrawn_t2)
[1] 0.1583851
length(CAMdrawn_t2_controlGroup_fake$ID_fakeCAMs) / length(CAMdrawn_t2_controlGroup)
[1] 0.1325301
# Build the "*_clean" CAM lists that the later survey cleaning steps read.
if (global_removeFakeCAMs) {
  # drop every CAM whose participant ID is on the list of fake CAMs
  #> t1
  CAMdrawn_t1_clean <-
    CAMdrawn_t1[!names(CAMdrawn_t1) %in% CAMdrawn_t1_fake$ID_fakeCAMs]

  #> t2
  CAMdrawn_t2_clean <-
    CAMdrawn_t2[!names(CAMdrawn_t2) %in% CAMdrawn_t2_fake$ID_fakeCAMs]

  #> t2 control group
  CAMdrawn_t2_controlGroup_clean <-
    CAMdrawn_t2_controlGroup[!names(CAMdrawn_t2_controlGroup) %in% CAMdrawn_t2_controlGroup_fake$ID_fakeCAMs]

  rm(CAMdrawn_t1_fake)
  rm(CAMdrawn_t2_fake)
  rm(CAMdrawn_t2_controlGroup_fake)
} else {
  # Nothing is removed, but the "*_clean" objects are still created: later code
  # reads names(CAMdrawn_*_clean) unconditionally and would otherwise stop with
  # "object not found".
  CAMdrawn_t1_clean <- CAMdrawn_t1
  CAMdrawn_t2_clean <- CAMdrawn_t2
  CAMdrawn_t2_controlGroup_clean <- CAMdrawn_t2_controlGroup
}

compute network indicators

### CAM network indicators
# The indicators are computed on the cleaned CAM lists when fake CAMs are
# removed and on all drawn CAMs otherwise. `if` only evaluates the selected
# branch, so the "*_clean" objects are not needed when the flag is FALSE.
# Degree, valence and closeness centrality are requested for the concept "Klimagesetz".
CAM_indicators_t1 <- compute_indicatorsCAM(
  drawn_CAM = if (global_removeFakeCAMs) CAMdrawn_t1_clean else CAMdrawn_t1,
  micro_degree = c("Klimagesetz"),
  micro_valence = c("Klimagesetz"),
  micro_centr_clo = c("Klimagesetz")
)

CAM_indicators_t2 <- compute_indicatorsCAM(
  drawn_CAM = if (global_removeFakeCAMs) CAMdrawn_t2_clean else CAMdrawn_t2,
  micro_degree = c("Klimagesetz"),
  micro_valence = c("Klimagesetz"),
  micro_centr_clo = c("Klimagesetz")
)

CAM_indicators_t2_controlGroup <- compute_indicatorsCAM(
  drawn_CAM = if (global_removeFakeCAMs) CAMdrawn_t2_controlGroup_clean else CAMdrawn_t2_controlGroup,
  micro_degree = c("Klimagesetz"),
  micro_valence = c("Klimagesetz"),
  micro_centr_clo = c("Klimagesetz")
)


# !!! attention: number of CAMs for which the valence of the concept
# "Klimagesetz" is missing (NA)
# NOTE(review): presumably the concept does not occur under this exact name in
# these CAMs; confirm before the indicator is used in analyses
sum(is.na(CAM_indicators_t1$valence_micro_Klimagesetz))
[1] 5
sum(is.na(CAM_indicators_t2$valence_micro_Klimagesetz))
[1] 4
sum(is.na(CAM_indicators_t2_controlGroup$valence_micro_Klimagesetz))
[1] 0

survey data

Keep only persons who passed the attention check:

# Each sum() reports how many participants failed the attention check; the
# following line keeps only those who chose "Tennisspielen".
# `%in%` is used instead of `==` for the row filter: a missing answer gives NA
# under `==`, and an NA row index does not drop the row but turns it into a row
# that is NA in every column. With `%in%` such rows are simply dropped.
# (The row counts echoed further down were produced before this fix and include
# those all-NA rows.)
sum(survey_p1_t1$attention.check != "Tennisspielen", na.rm = TRUE)
[1] 337
survey_p1_t1 <- survey_p1_t1[survey_p1_t1$attention.check %in% "Tennisspielen", ]

sum(survey_p1_t2$attention.check != "Tennisspielen", na.rm = TRUE)
[1] 20
survey_p1_t2 <- survey_p1_t2[survey_p1_t2$attention.check %in% "Tennisspielen", ]

sum(survey_p1_t2_controlGroup$attention.check != "Tennisspielen", na.rm = TRUE)
[1] 56
survey_p1_t2_controlGroup <- survey_p1_t2_controlGroup[survey_p1_t2_controlGroup$attention.check %in% "Tennisspielen", ]

Number of persons who participated in the study, before the data are cleaned according to the different criteria:

## t1
nrow(survey_p1_t1)
[1] 3372
nrow(survey_p2_t1)
[1] 570
## t2
nrow(survey_p1_t2)
[1] 471
nrow(survey_p2_t2)
[1] 325
## t2 control group
nrow(survey_p1_t2_controlGroup)
[1] 656
nrow(survey_p2_t2_controlGroup)
[1] 83

participants are flagged according to different criteria: (only deleted if any of “global_removeX”=TRUE)

  1. flag participants marked as spam (internal variable by Qualtrics):
## t1
# check_SPAM is TRUE when the Status column equals "Spam"
# (and NA where Status is missing)
sum(survey_p1_t1$Status == "Spam", na.rm = TRUE)
[1] 36
survey_p1_t1$check_SPAM <- survey_p1_t1$Status == "Spam"

sum(survey_p2_t1$Status == "Spam", na.rm = TRUE)
[1] 56
survey_p2_t1$check_SPAM <- survey_p2_t1$Status == "Spam"


## t2
sum(survey_p1_t2$Status == "Spam", na.rm = TRUE)
[1] 46
survey_p1_t2$check_SPAM <- survey_p1_t2$Status == "Spam"

sum(survey_p2_t2$Status == "Spam", na.rm = TRUE)
[1] 0
survey_p2_t2$check_SPAM <- survey_p2_t2$Status == "Spam"


## t2 control group
sum(survey_p1_t2_controlGroup$Status == "Spam", na.rm = TRUE)
[1] 9
survey_p1_t2_controlGroup$check_SPAM <- survey_p1_t2_controlGroup$Status == "Spam"

sum(survey_p2_t2_controlGroup$Status == "Spam", na.rm = TRUE)
[1] 0
survey_p2_t2_controlGroup$check_SPAM <- survey_p2_t2_controlGroup$Status == "Spam"
  1. flag participants with multiple identical IPs (if the number of identical IPs is greater than or equal to 3):
sort(table(survey_p1_t1$IPAddress), decreasing = TRUE)[1:10]

   45.90.58.145  179.43.145.229  179.43.145.251    45.90.57.220   179.43.146.25 
             47              32              30              29              24 
 179.43.145.232  179.43.176.203 195.206.105.194   84.227.217.18    31.10.147.91 
             21              21              13               7               5 
# For every data set: (1) mark the IPs that occur at least
# "global_removeIPs_num" times and print how many there are, (2) flag all rows
# that use one of these IPs in "check_IP" and print the number of flagged rows.
## t1
tmp_IP <- table(survey_p1_t1$IPAddress) >= global_removeIPs_num
sum(tmp_IP) # number of IPs greater / equal then "global_removeIPs_num"
[1] 45
tmp_IP <- names(tmp_IP)[tmp_IP]
survey_p1_t1$check_IP <- survey_p1_t1$IPAddress %in% tmp_IP
sum(survey_p1_t1$check_IP) # number of participants (rows) greater / equal then "global_removeIPs_num"
[1] 341

tmp_IP <- table(survey_p2_t1$IPAddress) >= global_removeIPs_num
sum(tmp_IP)
[1] 12
tmp_IP <- names(tmp_IP)[tmp_IP]
survey_p2_t1$check_IP <- survey_p2_t1$IPAddress %in% tmp_IP
sum(survey_p2_t1$check_IP)
[1] 133


## t2
tmp_IP <- table(survey_p1_t2$IPAddress) >= global_removeIPs_num
sum(tmp_IP)
[1] 12
tmp_IP <- names(tmp_IP)[tmp_IP]
survey_p1_t2$check_IP <- survey_p1_t2$IPAddress %in% tmp_IP
sum(survey_p1_t2$check_IP)
[1] 69

tmp_IP <- table(survey_p2_t2$IPAddress) >= global_removeIPs_num
sum(tmp_IP)
[1] 7
tmp_IP <- names(tmp_IP)[tmp_IP]
survey_p2_t2$check_IP <- survey_p2_t2$IPAddress %in% tmp_IP
sum(survey_p2_t2$check_IP)
[1] 50


## t2 control group
tmp_IP <- table(survey_p1_t2_controlGroup$IPAddress) >= global_removeIPs_num
sum(tmp_IP)
[1] 2
tmp_IP <- names(tmp_IP)[tmp_IP]
survey_p1_t2_controlGroup$check_IP <- survey_p1_t2_controlGroup$IPAddress %in% tmp_IP
sum(survey_p1_t2_controlGroup$check_IP)
[1] 8

tmp_IP <- table(survey_p2_t2_controlGroup$IPAddress) >= global_removeIPs_num
sum(tmp_IP)
[1] 2
tmp_IP <- names(tmp_IP)[tmp_IP]
survey_p2_t2_controlGroup$check_IP <- survey_p2_t2_controlGroup$IPAddress %in% tmp_IP
sum(survey_p2_t2_controlGroup$check_IP)
[1] 8


rm(tmp_IP)

overview of the percentage of data that would be removed under the criterion “number of identical IPs greater than or equal to 2, 3, 4 or 5”:

# One row per threshold (2 to 5): percentage of rows that share an IP occurring
# at least `i` times, for part 1 of t1 and for part 2 of t2.
# The names mat, i, index and tmp_IP are removed again after the table is printed.
mat <- matrix(data = NA, nrow = 4, ncol = 3)

for (i in 2:5) {
  index <- i - 1
  mat[index, 1] <- paste0("greater / equal then ", i)

  # t1 part 1
  tmp_IP <- table(survey_p1_t1$IPAddress)
  tmp_IP <- names(tmp_IP)[tmp_IP >= i]
  mat[index, 2] <- round(
    x = sum(survey_p1_t1$IPAddress %in% tmp_IP) / nrow(survey_p1_t1) * 100,
    digits = 0
  )

  # t2 part 2
  tmp_IP <- table(survey_p2_t2$IPAddress)
  tmp_IP <- names(tmp_IP)[tmp_IP >= i]
  mat[index, 3] <- round(
    x = sum(survey_p2_t2$IPAddress %in% tmp_IP) / nrow(survey_p2_t2) * 100,
    digits = 0
  )
}

mat <- as.data.frame(mat)
colnames(mat) <- c("criterion", "t1", "t2")

knitr::kable(x = mat)
criterion t1 t2
greater / equal then 2 17 27
greater / equal then 3 10 15
greater / equal then 4 8 14
greater / equal then 5 7 14
rm(tmp_IP)
rm(i)
rm(index)
rm(mat)

we decided to remove participants when the number of identical IPs is greater than or equal to 3

draw geolocations of participants

## t1
tmp <- survey_p2_t1[!is.na(survey_p2_t1$LocationLongitude) & !is.na(survey_p2_t1$LocationLatitude), ]
mapview::mapview(tmp, xcol = "LocationLongitude", ycol = "LocationLatitude", crs = 4269, grid = FALSE)
## t2
tmp <- survey_p2_t2[!is.na(survey_p2_t2$LocationLongitude) & !is.na(survey_p2_t2$LocationLatitude), ]
mapview::mapview(tmp, xcol = "LocationLongitude", ycol = "LocationLatitude", crs = 4269, grid = FALSE)
rm(tmp)

apply filters (remove participants)

Both filters (spam, multiple IPs) are not independent:

table(survey_p1_t1$check_SPAM, survey_p1_t1$check_IP)
       
        FALSE TRUE
  FALSE  1309  308
  TRUE      3   33
chisq.test(survey_p1_t1$check_SPAM, survey_p1_t1$check_IP)

    Pearson's Chi-squared test with Yates' continuity correction

data:  survey_p1_t1$check_SPAM and survey_p1_t1$check_IP
X-squared = 109.03, df = 1, p-value < 2.2e-16

both flags were also compared with whether participants finished the study (in the tests below only the multiple-IP flag is significantly related to finishing):

table(survey_p1_t1$Finished, survey_p1_t1$check_SPAM)
       
        FALSE TRUE
  FALSE   149    0
  TRUE   1468   36
chisq.test(survey_p1_t1$Finished, survey_p1_t1$check_SPAM)

    Pearson's Chi-squared test with Yates' continuity correction

data:  survey_p1_t1$Finished and survey_p1_t1$check_SPAM
X-squared = 2.6089, df = 1, p-value = 0.1063
table(survey_p1_t1$Finished, survey_p1_t1$check_IP)
       
        FALSE TRUE
  FALSE   138   11
  TRUE   1174  330
chisq.test(survey_p1_t1$Finished, survey_p1_t1$check_IP)

    Pearson's Chi-squared test with Yates' continuity correction

data:  survey_p1_t1$Finished and survey_p1_t1$check_IP
X-squared = 16.672, df = 1, p-value = 4.443e-05
  1. remove participants who have not finished the study (internal variable by Qualtrics):
# head(survey_p1_t1[survey_p1_t1$Finished == TRUE & !is.na(survey_p1_t1$Finished),])
# not finished
head(survey_p1_t1[survey_p1_t1$Finished == FALSE & !is.na(survey_p1_t1$Finished),])
               StartDate             EndDate     Status       IPAddress
2463 2023-05-11 12:39:03 2023-05-11 12:40:22 IP Address 217.193.178.154
2466 2023-05-11 12:42:40 2023-05-11 12:44:48 IP Address   84.227.217.18
2468 2023-05-11 12:42:05 2023-05-11 12:46:06 IP Address       85.7.7.68
2522 2023-05-11 15:36:26 2023-05-11 15:37:30 IP Address  92.104.123.108
2531 2023-05-11 15:55:56 2023-05-11 15:59:02 IP Address  46.253.188.162
2532 2023-05-11 15:54:48 2023-05-11 15:59:29 IP Address   83.222.140.74
     Progress durationSeconds Finished LocationLatitude LocationLongitude
2463       50              78    FALSE               NA                NA
2466       93             128    FALSE               NA                NA
2468       32             241    FALSE               NA                NA
2522       36              64    FALSE               NA                NA
2531       43             186    FALSE               NA                NA
2532       46             280    FALSE               NA                NA
     consent   gender age attention.check              education
2463       1 männlich  57   Tennisspielen  Obligatorische Schule
2466       1 weiblich  45   Tennisspielen  Obligatorische Schule
2468       1 weiblich  49   Tennisspielen Hochschulreife / Matur
2522       1 weiblich  25   Tennisspielen Hochschulreife / Matur
2531       1 männlich  59   Tennisspielen  Obligatorische Schule
2532       1 weiblich  64   Tennisspielen  Obligatorische Schule
                        participantID check_SPAM check_IP
2463 388fa1466ed1681ef5491f4c54489eaa      FALSE    FALSE
2466 f8969444a32445c3a19706cf97a4f2e5      FALSE     TRUE
2468 2239e4fb23d18d507b3f11c361f59935      FALSE    FALSE
2522 8de90c183bab1b7726c0308b104b20a9      FALSE    FALSE
2531 643b01c2abfff997b32a62c4758b001c      FALSE    FALSE
2532 c90f8c199d4016bdcae5b05acc3aedb4      FALSE    FALSE
# missing data
head(survey_p1_t1[is.na(survey_p1_t1$Finished),])
     StartDate EndDate Status IPAddress Progress durationSeconds Finished
NA        <NA>    <NA>   <NA>      <NA>       NA              NA       NA
NA.1      <NA>    <NA>   <NA>      <NA>       NA              NA       NA
NA.2      <NA>    <NA>   <NA>      <NA>       NA              NA       NA
NA.3      <NA>    <NA>   <NA>      <NA>       NA              NA       NA
NA.4      <NA>    <NA>   <NA>      <NA>       NA              NA       NA
NA.5      <NA>    <NA>   <NA>      <NA>       NA              NA       NA
     LocationLatitude LocationLongitude consent gender age attention.check
NA                 NA                NA      NA   <NA>  NA            <NA>
NA.1               NA                NA      NA   <NA>  NA            <NA>
NA.2               NA                NA      NA   <NA>  NA            <NA>
NA.3               NA                NA      NA   <NA>  NA            <NA>
NA.4               NA                NA      NA   <NA>  NA            <NA>
NA.5               NA                NA      NA   <NA>  NA            <NA>
     education participantID check_SPAM check_IP
NA        <NA>          <NA>         NA    FALSE
NA.1      <NA>          <NA>         NA    FALSE
NA.2      <NA>          <NA>         NA    FALSE
NA.3      <NA>          <NA>         NA    FALSE
NA.4      <NA>          <NA>         NA    FALSE
NA.5      <NA>          <NA>         NA    FALSE
print("Number of participants who have finished the study")
[1] "Number of participants who have finished the study"
## t1
sum(survey_p1_t1$Finished, na.rm = TRUE)
[1] 1504
sum(survey_p2_t1$Finished, na.rm = TRUE)
[1] 569
## t2
sum(survey_p1_t2$Finished, na.rm = TRUE)
[1] 368
sum(survey_p2_t2$Finished, na.rm = TRUE)
[1] 325
## t2 control group
sum(survey_p1_t2_controlGroup$Finished, na.rm = TRUE)
[1] 199
sum(survey_p2_t2_controlGroup$Finished, na.rm = TRUE)
[1] 83
# Keep only participants whose "Finished" flag is TRUE.
# `%in% TRUE` is used as row filter instead of the raw logical column: an NA in
# "Finished" would otherwise not drop the row but turn it into a row that is NA
# in every column.
# NOTE(review): this step is switched by "global_removeSpam", there is no
# separate global variable for unfinished participants.
if (global_removeSpam) {
  print("keep only particpants who have finished the study")
  ## t1
  survey_p1_t1 <- survey_p1_t1[survey_p1_t1$Finished %in% TRUE, ]
  survey_p2_t1 <- survey_p2_t1[survey_p2_t1$Finished %in% TRUE, ]

  ## t2
  survey_p1_t2 <- survey_p1_t2[survey_p1_t2$Finished %in% TRUE, ]
  survey_p2_t2 <- survey_p2_t2[survey_p2_t2$Finished %in% TRUE, ]

  ## t2 control group
  survey_p1_t2_controlGroup <- survey_p1_t2_controlGroup[survey_p1_t2_controlGroup$Finished %in% TRUE, ]
  survey_p2_t2_controlGroup <- survey_p2_t2_controlGroup[survey_p2_t2_controlGroup$Finished %in% TRUE, ]
}
[1] "keep only particpants who have finished the study"
  1. remove participants flagged as spam (internal variable by Qualtrics):
print("Number of participants flagged as spam")
[1] "Number of participants flagged as spam"
## t1
sum(survey_p1_t1$check_SPAM, na.rm = TRUE)
[1] 36
sum(survey_p2_t1$check_SPAM, na.rm = TRUE)
[1] 56
## t2
sum(survey_p1_t2$check_SPAM, na.rm = TRUE)
[1] 46
sum(survey_p2_t2$check_SPAM, na.rm = TRUE)
[1] 0
## t2 control group
sum(survey_p1_t2_controlGroup$check_SPAM, na.rm = TRUE)
[1] 9
sum(survey_p2_t2_controlGroup$check_SPAM, na.rm = TRUE)
[1] 0
# Remove participants flagged as spam (check_SPAM is TRUE).
# Rows are kept when the flag is FALSE (`%in% FALSE`). Negating the raw column
# would pass NA on as row index, and an NA index turns the row into a row that
# is NA in every column; with `%in%` rows with a missing flag are dropped instead.
if (global_removeSpam) {
  print("participants flagged as spam are removed")
  ## t1
  survey_p1_t1 <- survey_p1_t1[survey_p1_t1$check_SPAM %in% FALSE, ]
  survey_p2_t1 <- survey_p2_t1[survey_p2_t1$check_SPAM %in% FALSE, ]

  ## t2
  survey_p1_t2 <- survey_p1_t2[survey_p1_t2$check_SPAM %in% FALSE, ]
  survey_p2_t2 <- survey_p2_t2[survey_p2_t2$check_SPAM %in% FALSE, ]

  ## t2 control group
  survey_p1_t2_controlGroup <- survey_p1_t2_controlGroup[survey_p1_t2_controlGroup$check_SPAM %in% FALSE, ]
  survey_p2_t2_controlGroup <- survey_p2_t2_controlGroup[survey_p2_t2_controlGroup$check_SPAM %in% FALSE, ]
}
[1] "participants flagged as spam are removed"
  1. remove participants with multiple identical IPs (if the number of identical IPs is greater than or equal to 3):

We assume that multiple identical IPs could result from bots. As a first step, we removed all participants who have drawn “fake CAMs” (not containing a single reasonable / existing word):

# The values printed below are percentages of survey rows whose participant ID
# has no CAM in the cleaned CAM lists; the previous label ("flagged as spam")
# was a leftover from the preceding step.
print("Percentage of participants without a CAM that passed the fake-CAM check")
[1] "Percentage of participants without a CAM that passed the fake-CAM check"
## t1
round(x = (
  nrow(survey_p1_t1) - sum(survey_p1_t1$participantID %in% names(CAMdrawn_t1_clean))
) / nrow(survey_p1_t1) * 100,
digits = 0)
[1] 86
round(x = (
  nrow(survey_p2_t1) - sum(survey_p2_t1$participantID %in% names(CAMdrawn_t1_clean))
) / nrow(survey_p2_t1) * 100,
digits = 0)
[1] 14
## t2
round(x = (
  nrow(survey_p1_t2) - sum(survey_p1_t2$participantID %in% names(CAMdrawn_t2_clean))
) / nrow(survey_p1_t2) * 100,
digits = 0)
[1] 39
round(x = (
  nrow(survey_p2_t2) - sum(survey_p2_t2$participantID %in% names(CAMdrawn_t2_clean))
) / nrow(survey_p2_t2) * 100,
digits = 0)
[1] 17
## t2 control group
round(x = (
  nrow(survey_p1_t2_controlGroup) - sum(survey_p1_t2_controlGroup$participantID %in% names(CAMdrawn_t2_controlGroup_clean))
) / nrow(survey_p1_t2_controlGroup) * 100,
digits = 0)
[1] 89
round(x = (
  nrow(survey_p2_t2_controlGroup) - sum(survey_p2_t2_controlGroup$participantID %in% names(CAMdrawn_t2_controlGroup_clean))
) / nrow(survey_p2_t2_controlGroup) * 100,
digits = 0)
[1] 13
# Keep only survey rows (part 1 and part 2) whose participant ID belongs to a
# CAM in the cleaned CAM list of the same data collection.
if (global_removeFakeCAMs) {
  ## t1
  survey_p1_t1 <- survey_p1_t1[
    survey_p1_t1$participantID %in% names(CAMdrawn_t1_clean),
  ]
  survey_p2_t1 <- survey_p2_t1[
    survey_p2_t1$participantID %in% names(CAMdrawn_t1_clean),
  ]

  ## t2
  survey_p1_t2 <- survey_p1_t2[
    survey_p1_t2$participantID %in% names(CAMdrawn_t2_clean),
  ]
  survey_p2_t2 <- survey_p2_t2[
    survey_p2_t2$participantID %in% names(CAMdrawn_t2_clean),
  ]

  ## t2 control group
  survey_p1_t2_controlGroup <- survey_p1_t2_controlGroup[
    survey_p1_t2_controlGroup$participantID %in% names(CAMdrawn_t2_controlGroup_clean),
  ]
  survey_p2_t2_controlGroup <- survey_p2_t2_controlGroup[
    survey_p2_t2_controlGroup$participantID %in% names(CAMdrawn_t2_controlGroup_clean),
  ]
}


## participants who participated multiple times have also drawn fake CAMs
sort(table(survey_p1_t1$IPAddress), decreasing = TRUE)[1:10]

195.48.220.198  195.65.139.10   31.10.147.91  84.227.217.18  146.70.99.183 
             3              3              3              3              2 
 178.196.41.39 178.197.218.38 178.197.220.22 178.238.166.59   178.38.56.43 
             2              2              2              2              2 

In the next step, the remaining multiple IPs are investigated, and entries are removed if bots / participants gave identical socio-demographic information (considering “age”, “education”, “gender”):

# For every flagged IP that still occurs at least twice in part 1 of t1:
# look at the persons behind this IP who also took part in part 2, have a
# matching entry in "df_IDs" and whose wave-2 ID is in part 2 of t2. If several
# of them gave the same age, education and gender, only the first one is kept
# in "survey_p1_t1"; the row names of the removed rows are printed.
if (global_removeIPs) {
  ## flagged IPs (check_IP) that still occur at least twice
  tmp_IDs <- table(survey_p1_t1$IPAddress[survey_p1_t1$check_IP])
  tmp_IDs <- names(tmp_IDs)[tmp_IDs >= 2] # !!!

  # seq_along() skips the loop when no IP is left; 1:length() would run it with
  # the indices 1 and 0
  for (i in seq_along(tmp_IDs)) {
    #### at t1
    ## temporary data set of person with multiple IPs
    # %in% instead of ==, so that a missing IP can never select an all-NA row
    tmp_dat_p1_t1 <- survey_p1_t1[survey_p1_t1$IPAddress %in% tmp_IDs[i], ]
    tmp_dat_p2_t1 <-
      survey_p2_t1[survey_p2_t1$participantID %in% tmp_dat_p1_t1$participantID, ]
    ## person also participated in part 2
    tmp_dat_p1_t1 <-
      tmp_dat_p1_t1[tmp_dat_p1_t1$participantID %in% tmp_dat_p2_t1$participantID, ]
    tmp_dat_p2_t1 <-
      tmp_dat_p2_t1[tmp_dat_p2_t1$participantID %in% tmp_dat_p1_t1$participantID, ]
    ## person also a matching ID in the "df_IDs" data set
    tmp_dat_p1_t1 <-
      tmp_dat_p1_t1[tmp_dat_p1_t1$participantID %in% df_IDs$participantID.wave1, ]
    tmp_dat_p2_t1 <-
      tmp_dat_p2_t1[tmp_dat_p2_t1$participantID %in% df_IDs$participantID.wave1, ]

    if (nrow(tmp_dat_p2_t1) > 1) {
      # full column name: "$IPAddres" only worked through partial matching
      cat("following IP is investigated:", tmp_dat_p2_t1$IPAddress[1], "\n")
    }


    ## check compatibility of IDs
    tmp_ID <-
      df_IDs[df_IDs$participantID.wave1 %in% tmp_dat_p1_t1$participantID, ]

    ## check only IDs who finished study
    if (sum(tmp_ID$participantID.wave2 %in% survey_p2_t2$participantID) >= 1) {
      tmp_ID_finished <-
        tmp_ID$participantID.wave1[tmp_ID$participantID.wave2 %in% survey_p2_t2$participantID]

      tmp_dat_p1_t1 <-
        tmp_dat_p1_t1[tmp_dat_p1_t1$participantID %in% tmp_ID_finished, ]
      tmp_dat_p2_t1 <-
        tmp_dat_p2_t1[tmp_dat_p2_t1$participantID %in% tmp_ID_finished, ]


      ## keep only unique data sets
      tmp_keep <-
        unique(tmp_dat_p1_t1[, c("age", "education", "gender")])

      ## > if less unique data sets then nrow of unique ID data set remove
      if (nrow(tmp_keep) < nrow(tmp_dat_p1_t1)) {
        tmp_remove_rowname <-
          rownames(tmp_dat_p1_t1)[!rownames(tmp_dat_p1_t1) %in% rownames(tmp_keep)]
        cat("following rowname is removed:", tmp_remove_rowname, "\n")

        # %in% instead of !=: more than one row can be a duplicate, and != would
        # recycle the vector of row names instead of removing all of them
        survey_p1_t1 <-
          survey_p1_t1[!rownames(survey_p1_t1) %in% tmp_remove_rowname, ]
      }
    }
  }

  # remove only the temporary objects that were actually created (tmp_keep and
  # tmp_remove_rowname do not exist when no duplicate was found)
  rm(list = intersect(
    c("tmp_dat_p1_t1", "tmp_dat_p2_t1", "tmp_keep", "tmp_remove_rowname"),
    ls()
  ))
}
following IP is investigated: 178.196.41.39 
following IP is investigated: 188.63.210.243 
following rowname is removed: 2402 
following IP is investigated: 195.202.210.20 
following IP is investigated: 195.48.220.198 
following IP is investigated: 195.65.139.10 
following rowname is removed: 1889 
following IP is investigated: 31.10.144.31 
following IP is investigated: 31.10.147.91 
following IP is investigated: 31.10.155.196 
following rowname is removed: 3326 
following IP is investigated: 84.227.217.18 
following IP is investigated: 85.7.52.58 
following rowname is removed: 1651 
following IP is investigated: 95.143.55.212 
following rowname is removed: 1682 

Merge data sets

### drop the technical columns that are not part of the final data set
remove_vars <- c(
  "Status", "IPAddress", "Progress", "Finished",
  "LocationLatitude", "LocationLongitude",
  "check_SPAM", "check_IP"
)

## t1
survey_p1_t1 <- survey_p1_t1[, !(names(survey_p1_t1) %in% remove_vars)]
survey_p2_t1 <- survey_p2_t1[, !(names(survey_p2_t1) %in% remove_vars)]

## t2
survey_p1_t2 <- survey_p1_t2[, !(names(survey_p1_t2) %in% remove_vars)]
survey_p2_t2 <- survey_p2_t2[, !(names(survey_p2_t2) %in% remove_vars)]

## t2 control group
survey_p1_t2_controlGroup <- survey_p1_t2_controlGroup[, !(names(survey_p1_t2_controlGroup) %in% remove_vars)]
survey_p2_t2_controlGroup <- survey_p2_t2_controlGroup[, !(names(survey_p2_t2_controlGroup) %in% remove_vars)]

rm(remove_vars)


### merge data sets -> part 1 + part 2
# Part 1 and part 2 are joined on "participantID"; merge() keeps only IDs that
# occur in both parts. The nrow() calls show the sizes before and after.
nrow(survey_p1_t1)
[1] 438
nrow(survey_p2_t1)
[1] 439
survey_t1 <- merge(x = survey_p1_t1, y = survey_p2_t1, by = "participantID")
nrow(survey_t1)
[1] 432
nrow(survey_p1_t2)
[1] 258
nrow(survey_p2_t2)
[1] 271
survey_t2 <- merge(x = survey_p1_t2, y = survey_p2_t2, by = "participantID")
nrow(survey_t2)
[1] 258
nrow(survey_p1_t2_controlGroup)
[1] 69
nrow(survey_p2_t2_controlGroup)
[1] 72
survey_t2_controlGroup <- merge(x = survey_p1_t2_controlGroup, y = survey_p2_t2_controlGroup, by = "participantID")
nrow(survey_t2_controlGroup)
[1] 69
### check if any IDs are  not unique
## t1
tmp_t1 <- names(table(survey_t1$participantID))[table(survey_t1$participantID) >= 2]
tmp_t1
[1] "fe004c0911ba569d1d4c4190c01e4147"
## t2
tmp_t2 <- names(table(survey_t2$participantID))[table(survey_t2$participantID) >= 2]
tmp_t2
[1] "a38c17108b62d1e9a7d21907a65eec63"
## t2 control group
tmp_t2controlGroup <- names(table(survey_t2_controlGroup$participantID))[table(survey_t2_controlGroup$participantID) >= 2]
tmp_t2controlGroup
character(0)
### remove:
if(length(tmp_t1) >= 1){
  print(survey_t1[survey_t1$participantID %in% tmp_t1,])
  survey_t1 <- survey_t1[!survey_t1$participantID %in% tmp_t1, ]
}
                       participantID         StartDate.x           EndDate.x
424 fe004c0911ba569d1d4c4190c01e4147 2023-05-22 10:37:22 2023-05-22 10:42:31
425 fe004c0911ba569d1d4c4190c01e4147 2023-05-22 10:43:02 2023-05-22 10:45:10
    durationSeconds.x consent   gender age attention.check
424               309       1 weiblich  30   Tennisspielen
425               127       1 weiblich  30   Tennisspielen
                                  education         StartDate.y
424 Bachelor/ Master/ Promotion (Dr. / PhD) 2023-05-22 11:04:48
425 Bachelor/ Master/ Promotion (Dr. / PhD) 2023-05-22 11:04:48
              EndDate.y durationSeconds.y voteCo2 intendedVote ratingPositive
424 2023-05-22 11:06:19                91       1            1              8
425 2023-05-22 11:06:19                91       1            1              8
    ratingFavor politicalOrientation technical_problem technical_problem_2_TEXT
424           8                   10              Nein                     <NA>
425           8                   10              Nein                     <NA>
        representative
424 sehr repräsentativ
425 sehr repräsentativ
if(length(tmp_t2) >= 1){
  print(survey_t2[survey_t2$participantID %in% tmp_t2,])
  survey_t2 <- survey_t2[!survey_t2$participantID %in% tmp_t2, ]
}
                       participantID         StartDate.x           EndDate.x
156 a38c17108b62d1e9a7d21907a65eec63 2023-06-14 13:56:27 2023-06-14 14:01:06
157 a38c17108b62d1e9a7d21907a65eec63 2023-06-14 13:56:27 2023-06-14 14:01:06
    durationSeconds.x consent attention.check         StartDate.y
156               279       1   Tennisspielen 2023-06-14 14:16:24
157               279       1   Tennisspielen 2023-06-14 14:12:26
              EndDate.y durationSeconds.y intendedVote ratingPositive
156 2023-06-14 14:17:29                65            1              8
157 2023-06-14 14:14:26               119            1              8
    ratingFavor partyIdentification partyIdentification_text
156           8               Grüne                     <NA>
157           8               Grüne                     <NA>
              cc_concerns_1           cc_concerns_2           cc_concerns_3
156 stimme voll und ganz zu stimme voll und ganz zu stimme voll und ganz zu
157 stimme voll und ganz zu stimme voll und ganz zu stimme voll und ganz zu
              cc_concerns_4                           engagement_1
156 stimme voll und ganz zu Für circa 5-6 Stunden im letzten Monat
157 stimme voll und ganz zu Für circa 4-5 Stunden im letzten Monat
                              engagement_2
156 Für circa 3-4 Stunden im letzten Monat
157 Für circa 4-5 Stunden im letzten Monat
                              engagement_3
156 Für circa 2-3 Stunden im letzten Monat
157 Für circa 2-3 Stunden im letzten Monat
                              engagement_4
156 Für circa 2-3 Stunden im letzten Monat
157 Für circa 6-7 Stunden im letzten Monat
                              engagement_5                              media
156 Für circa 1-2 Stunden im letzten Monat     BZ (Berner Zeitung) / der Bund
157 Für circa 1-2 Stunden im letzten Monat SRF,BZ (Berner Zeitung) / der Bund
    media1_text media2_text technical_problem
156        <NA>        <NA>              Nein
157        <NA>        <NA>      Ja, und zwar
                               technical_problem_2_TEXT     representative
156                                                <NA> eher repräsentativ
157 kompliziert, wenn man dieses tool sonst nicht kennt eher repräsentativ
if(length(tmp_t2controlGroup) >= 1){
  print(survey_t2_controlGroup[survey_t2_controlGroup$participantID %in% tmp_t2controlGroup,])
  survey_t2_controlGroup <- survey_t2_controlGroup[!survey_t2_controlGroup$participantID %in% tmp_t2controlGroup, ]
}

# survey_t1[survey_t1$participantID == "fe004c0911ba569d1d4c4190c01e4147",]
# survey_t1 <- survey_t1[rownames(survey_t1) != "430", ]
# survey_t1[survey_t1$participantID == "ff515ae5174e7e81d94cbec8d4a6f578",]
# survey_t1 <- survey_t1[rownames(survey_t1) != "437", ]

# survey_t2[survey_t2$participantID == "833e71b1d892bab6ccab57d1c5ae6004",]
# survey_t2 <- survey_t2[rownames(survey_t2) != "138", ]
# survey_t2[survey_t2$participantID == "a38c17108b62d1e9a7d21907a65eec63",]
# survey_t2 <- survey_t2[rownames(survey_t2) != "166", ]



### merge data sets -> attach CAM indicators to surveys
# every survey row is kept; the CAM indicator columns are appended by
# matching the survey "participantID" to "participantCAM"
survey_t1_CAM <- survey_t1 %>%
  left_join(y = CAM_indicators_t1, by = c("participantID" = "participantCAM"))
survey_t2_CAM <- survey_t2 %>%
  left_join(y = CAM_indicators_t2, by = c("participantID" = "participantCAM"))
survey_t2_controlGroup_CAM <- survey_t2_controlGroup %>%
  left_join(y = CAM_indicators_t2_controlGroup, by = c("participantID" = "participantCAM"))

# mark the wave in the column names so that t1 and t2 can be told apart after
# the later t1/t2 merge
# NOTE(review): the columns are addressed by position (21:50 and 30:59),
# presumably the appended CAM indicator columns - verify whenever a survey
# variable is added or removed
names(survey_t1_CAM)[21:50] <- paste0(names(survey_t1_CAM)[21:50], "_t1")
names(survey_t2_CAM)[30:59] <- paste0(names(survey_t2_CAM)[30:59], "_t2")




### merge data sets -> attach ID data set to match t1 + t2
# for t1 it is possible (unique IDs)
nrow(df_IDs)
[1] 736
length(unique(df_IDs$participantID.wave1))
[1] 736
nrow(survey_t1_CAM)
[1] 430
survey_t1_CAM_IDs <- left_join(survey_t1_CAM, df_IDs,  by = c("participantID" = "participantID.wave1"))
nrow(survey_t1_CAM_IDs)
[1] 430
# for t2 it is not possible (not unique IDs)
nrow(df_IDs)
[1] 736
length(unique(df_IDs$participantID.wave2))
[1] 421
# !!!
nrow(survey_t2_CAM)
[1] 256
survey_t2_CAM_IDs <- left_join(survey_t2_CAM, df_IDs,  by = c("participantID" = "participantID.wave2"))
nrow(survey_t2_CAM_IDs)
[1] 406
# Build a one-to-one lookup table (wave-1 ID, wave-2 ID) for the t2 sample.
# A wave-2 ID can occur in several rows of df_IDs; a row is only accepted if
# exactly one of its wave-1 candidates is part of the cleaned t1 data set.
df_IDs_t2 <- data.frame(participantID.wave1 = NA, participantID.wave2 = NA)
h <- 1 # next row of df_IDs_t2 to be filled
n <- 1 # 1 + number of wave-2 IDs without any wave-1 match
# seq_len() instead of 1:nrow(): the loop is skipped for an empty data set
for (i in seq_len(nrow(survey_t2_CAM))) {
  # all ID pairs recorded for the current wave-2 participant
  tmp <- df_IDs[df_IDs$participantID.wave2 %in% survey_t2_CAM$participantID[i], ]

  # which of the candidate wave-1 IDs survived the t1 data cleaning?
  tmp_boolean <- tmp$participantID.wave1 %in% survey_t1_CAM_IDs$participantID
  tmp_n_matches <- sum(tmp_boolean)

  if (tmp_n_matches == 1) {
    # select both ID columns by name, so that the positional row assignment
    # does not depend on the column order / extra columns of df_IDs
    tmp_out <- tmp[tmp_boolean, c("participantID.wave1", "participantID.wave2")]
    df_IDs_t2[h, ] <- tmp_out
    h <- h + 1
  } else if (tmp_n_matches == 0) {
    cat("\n for ID: \"", survey_t2_CAM$participantID[i], "\" no match can be found\n")
    n <- n + 1
  } else {
    cat("\n for ID: \"", survey_t2_CAM$participantID[i], "\" too many matches can be found\n")
  }
}

 for ID: " 11b32e7e9d4d6952a176e8ba561c010b " no match can be found

 for ID: " 2785fdf7076f6541968370b4cf0a2ef6 " no match can be found

 for ID: " 4877d0a043e307148049bacdd1a98bfa " no match can be found

 for ID: " 7872a5468ead7a2e7e57f694f9f1e40b " no match can be found

 for ID: " 8e80af93733288d0d86c3acd8238848c " no match can be found

 for ID: " 9d9c3466254492e700ffc5255bc7a23e " no match can be found

 for ID: " a04634868834bbf85ff4dcb2511ba6b4 " no match can be found

 for ID: " a431e6758ba5b101a90c0c44ccfb4d47 " no match can be found

 for ID: " b38424cec2ad6971a7acadffe01f3cd8 " no match can be found

 for ID: " b83c5fadc6378c90b21542d85f9a05a4 " no match can be found

 for ID: " bdbe994e4dc97483c83b01ac5e14d663 " no match can be found

 for ID: " c6e069362fa5c53b9a580a345ac83ea0 " no match can be found

 for ID: " e13a72dd0441ae3716177ac51f4a7693 " no match can be found

 for ID: " faea5e67289d065352b34ccccf82f615 " no match can be found
nrow(survey_t2_CAM)
[1] 256
survey_t2_CAM_IDs <- left_join(survey_t2_CAM, df_IDs_t2,  by = c("participantID" = "participantID.wave2"))
nrow(survey_t2_CAM_IDs)
[1] 256
## no matching IDs found for wave 1, wave 2
sum(is.na(survey_t1_CAM_IDs$participantID.wave2))
[1] 81
sum(is.na(survey_t2_CAM_IDs$participantID.wave1))
[1] 14
nrow(df_IDs_t2)
[1] 242
survey_t12_keept1 <- left_join(survey_t1_CAM_IDs, survey_t2_CAM_IDs,  by = c("participantID" = "participantID.wave1"))
nrow(survey_t12_keept1)
[1] 430
survey_t12_keept2 <- left_join(survey_t2_CAM_IDs, survey_t1_CAM_IDs,  by = c("participantID" = "participantID.wave2"))
nrow(survey_t12_keept2)
[1] 256
survey_t12_keept12 <- full_join(survey_t1_CAM_IDs, survey_t2_CAM_IDs,  by = c("participantID" = "participantID.wave1"))
nrow(survey_t12_keept12)
[1] 444
survey_t12 <- merge(survey_t1_CAM_IDs, survey_t2_CAM_IDs, by.x="participantID", by.y="participantID.wave1")
nrow(survey_t12)
[1] 242
# check for plausibility
plot(survey_t12$mean_valence_macro_t1, survey_t12$mean_valence_macro_t2)

cor(survey_t12$mean_valence_macro_t1, survey_t12$mean_valence_macro_t2)
[1] 0.665492
cor(survey_t12_keept12$mean_valence_macro_t1, survey_t12_keept12$mean_valence_macro_t2, use = "pairwise")
[1] 0.665492
sum(is.na(survey_t12_keept12$mean_valence_macro_t1))
[1] 14
sum(is.na(survey_t12_keept12$mean_valence_macro_t2))
[1] 188

save files

# Write all result files into the "outputs" folder (relative to the current
# working directory). Explicit paths are used instead of setwd(), so the
# working directory of the session is left untouched and later relative paths
# keep resolving from the same place.
tmp_dir_out <- "outputs"
dir.create(tmp_dir_out, showWarnings = FALSE, recursive = TRUE)

## full join, keep all data
xlsx::write.xlsx2(x = survey_t12_keept12, file = file.path(tmp_dir_out, "survey_t12_keept12.xlsx"))
write.csv2(x = survey_t12_keept12, file = file.path(tmp_dir_out, "survey_t12_keept12.csv"))
saveRDS(object = survey_t12_keept12, file = file.path(tmp_dir_out, "survey_t12_keept12.rds"))

## merge (only keep data which can be completely matched)
xlsx::write.xlsx2(x = survey_t12, file = file.path(tmp_dir_out, "survey_t12.xlsx"))
write.csv2(x = survey_t12, file = file.path(tmp_dir_out, "survey_t12.csv"))
saveRDS(object = survey_t12, file = file.path(tmp_dir_out, "survey_t12.rds"))

## and save control group
xlsx::write.xlsx2(x = survey_t2_controlGroup_CAM, file = file.path(tmp_dir_out, "survey_t2_controlGroup.xlsx"))
write.csv2(x = survey_t2_controlGroup_CAM, file = file.path(tmp_dir_out, "survey_t2_controlGroup.csv"))
saveRDS(object = survey_t2_controlGroup_CAM, file = file.path(tmp_dir_out, "survey_t2_controlGroup.rds"))

Check that the CAM IDs match the merged data set and save the CAM files

nrow(survey_t12)
[1] 242
length(unique(survey_t12$participantID))
[1] 242
length(unique(CAM_files_t1[[1]]$participantCAM))
[1] 575
length(unique(CAM_files_t2[[1]]$participantCAM))
[1] 322
all(survey_t12$participantID %in% CAM_files_t1[[1]]$participantCAM)
[1] TRUE
all(survey_t12$participantID.wave2 %in% CAM_files_t2[[1]]$participantCAM)
[1] TRUE
nrow(survey_t2_controlGroup_CAM)
[1] 69
length(unique(survey_t2_controlGroup_CAM$participantID))
[1] 69
length(unique(CAM_files_t2_controlGroup[[1]]$participantCAM))
[1] 83
all(survey_t2_controlGroup_CAM$participantID %in% CAM_files_t2_controlGroup[[1]]$participantCAM)
[1] TRUE
## check if all IDs are unique
names(table(c(survey_t12$participantID, survey_t12$participantID.wave2, survey_t2_controlGroup_CAM$participantID)))[table(c(survey_t12$participantID, survey_t12$participantID.wave2, survey_t2_controlGroup_CAM$participantID)) >= 2]
character(0)
### save CAMs
# Write every CAM whose creator is part of the final data sets to one text
# file, one JSON document per line. The target is addressed by an explicit
# path instead of setwd(), so the working directory is not changed.
tmp_dir_CAMs <- file.path("outputs", "data CAMs")
dir.create(tmp_dir_CAMs, showWarnings = FALSE, recursive = TRUE)
tmp_file_CAMs <- file.path(tmp_dir_CAMs, "CAMdata_ClimateLaw.txt")

writeLines("", tmp_file_CAMs) # create file (starts with one empty line)
text_connection <- file(tmp_file_CAMs, "a") # open connection to append


vec_ids <- c(); h <- 1 # IDs of all written CAMs / next index of vec_ids

# each entry pairs a list of raw CAMs with the participant IDs to be kept:
# survey_t12 - t1, survey_t12 - t2, survey_t2_controlGroup_CAM
tmp_sources <- list(
  list(cams = dfCAM_t1_list,
       ids = unique(survey_t12$participantID)),
  list(cams = dfCAM_t2_list,
       ids = unique(survey_t12$participantID.wave2)),
  list(cams = dfCAM_t2_controlGroup_list,
       ids = unique(survey_t2_controlGroup_CAM$participantID))
)

# finally = close(...): the connection is released even if a write fails
tryCatch({
  for (tmp_src in tmp_sources) {
    # seq_along() instead of 1:length(): nothing is done for an empty list
    for (i in seq_along(tmp_src$cams)) {
      tmp_creator <- tmp_src$cams[[i]]$creator
      if (!is.null(tmp_creator) && tmp_creator %in% tmp_src$ids) {
        vec_ids[h] <- tmp_creator
        h <- h + 1

        writeLines(jsonlite::toJSON(x = tmp_src$cams[[i]]), text_connection)
      }
    }
  }
}, finally = close(text_connection))



# Completeness check: every participant of the final data sets must have a
# written CAM. (Testing "vec_ids %in% survey IDs" would always be TRUE, because
# vec_ids is only filled with IDs that were found in exactly these sets.)
tmp_expected_ids <- c(survey_t12$participantID,
                      survey_t12$participantID.wave2,
                      survey_t2_controlGroup_CAM$participantID)
if (all(tmp_expected_ids %in% vec_ids)) {
  print("all CAMs saved")
} else {
  warning("no CAM was saved for ",
          sum(!tmp_expected_ids %in% vec_ids),
          " participant ID(s) of the final data sets",
          call. = FALSE)
}
[1] "all CAMs saved"

References

Fenn, Julius, Florian Gouret, Michael Gorki, Paul Hüttner, Lisa Reuter, Wilhelm Gros, and Andrea Kiesel. 2023. “Cognitive-Affective Maps Extended Logic.” https://github.com/Camel-app/DataCollection.
Newman, Mark. 2018. Networks: An Introduction. Oxford University Press.
Peng, Roger D., and Elizabeth Matsui. 2016. The Art of Data Science: A Guide for Anyone Who Works with Data. Lulu.com. https://bookdown.org/rdpeng/artofdatascience/.
Skrondal, Anders, and Sophia Rabe-Hesketh. 2004. Generalized Latent Variable Modeling: Multilevel, Longitudinal, and Structural Equation Models. New York: Chapman; Hall/CRC. https://doi.org/10.1201/9780203489437.
Wickham, Hadley, and Garrett Grolemund. 2017. R for Data Science: Import, Tidy, Transform, Visualize, and Model Data. "O’Reilly Media, Inc.". https://r4ds.had.co.nz/.
Xie, Yihui, J. J. Allaire, and Garrett Grolemund. 2018. R Markdown: The Definitive Guide. New York: Chapman; Hall/CRC. https://doi.org/10.1201/9781138359444.