DATA CLEANING

Read in data

#load packages
library(tidyverse)
library(here)
library(yarrr) #for pirate plots

# Read in the raw data for Experiments 1 and 2.
# here() builds each path relative to the project root, so the script no
# longer depends on the files living under one machine-specific absolute path.
# NOTE(review): Experiment 1 is read from the "Study 8" file and Experiment 2
# from the "Study 7" file -- confirm that this mapping is intended.
data_exp1 <- read_csv(file = here("Study 8 data.csv"))
data_exp2 <- read_csv(file = here("Study 7 data.csv"))

#Clean data for Experiment 1

# Drop the first two rows of the raw export, then give the awkwardly named
# columns clearer names
cleandata_exp1 <- rename(
  data_exp1[-(1:2), ],
  condition = FL_10_DO,
  duration = `Duration (in seconds)`
)

# Apply exclusion criteria 

# Participants are identified by Prolific_PID.
# List the IDs that occur more than once (kept for inspection)
duplicates <- cleandata_exp1 %>%
  count(Prolific_PID, name = "n_responses") %>%
  filter(n_responses > 1) %>%
  pull(Prolific_PID)

# For every ID keep only the first response that appears in the data
cleandata_exp1 <- cleandata_exp1 %>%
  group_by(Prolific_PID) %>%
  slice_head(n = 1) %>%
  ungroup()


# Exclusion criteria: keep only participants who consented, answered
# seriously, finished the survey and scored at least 4 on the attention check.
#
# The attention-check score is converted to a number before the comparison:
# if the column is stored as text, `SC0 >= 4` is evaluated alphabetically and
# a score such as "10" would wrongly fail the check.
#
# `na.rm` is not a filter() argument (a named TRUE is only an always-true
# condition), so it has been removed; filter() already drops every row for
# which a condition evaluates to NA.
cleandata_exp1 <- cleandata_exp1 %>%
  filter(
    Consent == 1,
    Serious_check == 1,
    Finished == 1,
    as.numeric(SC0) >= 4
  ) %>%
  # keep only the columns needed for the demographics and Figures 1 and 2
  select(duration, Gender, Age, contradiction_1:condition)

#Reproduce Demographics for Experiment 1

# Demographics for Experiment 1
# Tally the participants within each gender code
gender_counts <- count(cleandata_exp1, Gender, name = "count")

# There are 126 males "1" and 168 females "2"
gender_counts
# Age
# make sure Age is stored as a number before summarising it
cleandata_exp1$Age <- as.numeric(as.character(cleandata_exp1$Age))

# Range, mean and standard deviation of age (missing values ignored)
# Participants were aged 18-69
min(cleandata_exp1[["Age"]], na.rm = TRUE)
## [1] 18
max(cleandata_exp1[["Age"]], na.rm = TRUE)
## [1] 69
# Mean age was 34.29
mean(cleandata_exp1[["Age"]], na.rm = TRUE)
## [1] 34.29252
# Standard deviation was 12.97
sd(cleandata_exp1[["Age"]], na.rm = TRUE)
## [1] 12.96633

Reproduce Figure 1

Preparing the data for the figures took a lot of trial and error, as we were still learning new functions and how they worked. First, we had to create a new variable that summed the scores for the contradiction questions, which is what the experimenters specified in their paper. We later found that instead of listing each contradiction DV in the select() function, you can select a range of columns using “:”.

We had trouble running this section of code because several values in the data were not numeric, so we got some help from ChatGPT and found that you can convert them automatically by mutating the columns with “as.numeric”.

Below is the final product:

# Build the overall 'contradiction' score: the sum of the six contradiction
# ratings for each participant

# Step 1: the six rating columns are not numeric yet, so convert them
cleandata_exp1 <- mutate(
  cleandata_exp1,
  across(contradiction_1:contradiction_6, as.numeric)
)

# Step 2: add the row-wise total as a new column (missing ratings are skipped)
cleandata_exp1 <- mutate(
  cleandata_exp1,
  contradiction = rowSums(
    select(cleandata_exp1, contradiction_1:contradiction_6),
    na.rm = TRUE
  )
)

We also had to ensure that the other question categories (confusion and advancement) were numeric too. The values are first converted to numeric, and then rows with missing values (i.e. NA cells) are removed.

# Convert the advancement and confusion scores to numbers, then drop the
# participants who are missing either score.
# drop_na() is restricted to these two columns; na.omit() on the whole data
# frame would also discard anyone with a missing value in an unrelated column
# (for example Age), which is not part of the exclusion criteria.
cleandata_exp1 <- cleandata_exp1 %>%
  mutate(across(c(advancement, confusion), as.numeric)) %>%
  drop_na(advancement, confusion)

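We then made a dataframe for Figure 1. group_by() splits the data into groups defined by the listed columns, and summarise() collapses each group into a single row; here n = n() counts the number of participants in each group.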

# Data frame for the Figure 1 plot: number of participants for every
# combination of contradiction score, condition, advancement and confusion
figure1 <- summarise(
  group_by(cleandata_exp1, contradiction, condition, advancement, confusion),
  n = n()
)
## `summarise()` has grouped output by 'contradiction', 'condition',
## 'advancement'. You can override using the `.groups` argument.

We then separated the IV column, whose values looked like “Block_1_Generic_Conflict”. It held both the Conflict and the Format variables in one column, so we needed to split them into separate columns, because that is how the x-axis is presented.

# Split the condition label into its four parts and shorten the Conflict
# labels to the abbreviations used on the plot axis
figure1 <- cleandata_exp1 %>%
  separate(
    col = condition,
    into = c("block", "number", "Format", "Conflict")
  ) %>%
  mutate(
    Conflict = case_when(
      Conflict == "Conflict" ~ "Conf.",
      Conflict == "Consistent" ~ "Non.Conf.",
      TRUE ~ Conflict # any other value is left unchanged
    )
  )

Creating our actual plot took a lot of trial and error. We started with what we knew (ggplot using violin plots and boxplots). Basically, here we are just trying to see what could work. We aren't too worried about aesthetics and format yet; we are trying to understand the figure and the code used. We used ChatGPT to help fix an error, and it recommended first reshaping the data into long format, where each row is a single observation and columns are used to indicate the variable and its value.

# Long format: one row per participant per outcome, with the outcome name in
# `Variable` and its score in `Value`
figure1_long <- pivot_longer(
  figure1,
  cols = c(confusion, advancement, contradiction),
  names_to = "Variable",
  values_to = "Value"
)

# First attempt: violins with narrow boxplots on top, one facet row per
# outcome and one facet column per Conflict level
figure1_long %>%
  ggplot(aes(x = Format, y = Value, fill = Format)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_boxplot(
    width = 0.2,
    position = position_dodge(width = 0.9),
    outlier.shape = NA
  ) +
  facet_grid(Variable ~ Conflict, scales = "free") +
  theme_minimal() +
  theme(legend.position = "none")

Obviously this isn't what we are looking for at all. We went to ChatGPT to see if we could rework the code so that it looked more like the original plot.

# Second attempt: Conflict on the x-axis, one panel per outcome laid out on a
# 2 x 2 grid, with the facet strips restyled
figure1_long %>%
  ggplot(aes(x = Conflict, y = Value, fill = Format)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_boxplot(
    width = 0.2,
    position = position_dodge(width = 0.9),
    outlier.shape = NA
  ) +
  facet_wrap(~ Variable, scales = "free", ncol = 2, nrow = 2) +
  labs(x = "Conflict\nFormat", y = "Score") +
  theme_minimal() +
  theme(
    strip.background = element_blank(),
    strip.placement = "outside",
    strip.text.x = element_text(margin = margin(t = 10, b = 10)),
    axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 1),
    legend.position = "none"
  )

We got closer here, but obviously this still isn't what we are looking for. We figured we were probably on the wrong track and needed to do further research to understand how the plots were made.

We learnt that the researchers used pirate plots from the yarrr package. We also learnt that they combined multiple plots into a single plotting area using par(mfrow = ...).

  1. Start with basic plots to get an understanding of the data.
# Lay the basic plots out on a grid of 2 rows and 2 columns
par(mfrow = c(2, 2))

# All basic panels share the same data, inference method and theme; only the
# outcome formula, the title and the y-axis label change between them
draw_basic_panel <- function(outcome_formula, title, y_label) {
  pirateplot(
    formula = outcome_formula,
    data = figure1,
    inf.method = "ci",
    theme = 1,
    main = title,
    ylab = y_label
  )
}

# Contradiction
draw_basic_panel(
  contradiction ~ Conflict * Format,
  title = "Contradiction",
  y_label = "Perceived Contradiction"
)

# Advancement
draw_basic_panel(
  advancement ~ Conflict * Format,
  title = "Advancement",
  y_label = "Perceived Scientific Advancement"
)

# Confusion
draw_basic_panel(
  confusion ~ Conflict * Format,
  title = "Confusion",
  y_label = "Perceived Confusion"
)

  2. Then we did all the fine-tuning required to make it look the same. We also had to add “fig.height=10, fig.width=7” to the chunk header to prevent the plots from looking squashed.
# Lay the panels out on a grid of 2 rows and 2 columns
par(mfrow = c(2, 2))

# Draw one Figure 1 panel. The default y-axis is suppressed (yaxt = "n") and
# redrawn with ticks at `y_ticks`. Extra arguments (such as ylim) are passed
# straight through to pirateplot().
draw_figure1_panel <- function(outcome_formula, title, y_label, y_ticks, ...) {
  pirateplot(
    formula = outcome_formula,
    data = figure1,
    inf.method = "ci",
    yaxt = "n",
    theme = 1,
    main = title,
    ylab = y_label,
    cex.names = 0.75,
    cex.lab = 0.9,
    ...
  )
  axis(2, at = y_ticks)
}

# First plot: Contradiction
draw_figure1_panel(
  contradiction ~ Conflict * Format,
  title = "Contradiction",
  y_label = "Perceived Contradiction",
  y_ticks = seq(from = 0, to = 30, by = 5),
  ylim = c(0, 30)
)

# Second plot: Advancement (no ylim given, so pirateplot picks the limits)
draw_figure1_panel(
  advancement ~ Conflict * Format,
  title = "Advancement",
  y_label = "Perceived Scientific Advancement",
  y_ticks = seq(from = -1, to = 1, by = 1)
)

# Third plot: Confusion
draw_figure1_panel(
  confusion ~ Conflict * Format,
  title = "Confusion",
  y_label = "Perceived Confusion",
  y_ticks = seq(from = 1, to = 5, by = 1),
  ylim = c(1, 5)
)

# The fourth cell of the grid is left empty
plot.new()

#Reproduce Figure 2

For this plot we had a lot less of an issue prepping a dataset. The main troubleshooting was getting the plot to look exactly the same.

  1. Run basic ggplot
# Count the participants in every condition x advancement combination and
# replace the raw codes with readable labels.
# count() returns an ungrouped table, so no grouping is carried into the
# recoding step and summarise() no longer prints a grouping message.
# The 'Non-conflicting/Generic' label previously contained a typo
# ('Non-conflicing'), which showed up in the plot legend.
figure2 <- cleandata_exp1 %>%
  count(condition, advancement) %>%
  mutate(
    # recode condition titles
    condition = recode(
      condition,
      "Block_1_Generic_Conflict" = "Conflicting/Generic",
      "Block_3_Qualified_Conflict" = "Conflicting/Qualified",
      "Block_2_Generic_Consistent" = "Non-conflicting/Generic",
      "Block_4_Qualified_Consistent" = "Non-conflicting/Qualified"
    ),
    # recode advancement titles
    advancement = recode(
      advancement,
      "-1" = "Less",
      "0" = "Same",
      "1" = "More"
    )
  )

# Plot the counts as dodged bars, one bar per condition, in greyscale
plot <- ggplot(figure2, aes(x = advancement, y = n, fill = condition)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("#333333", "#818181", "#ababab", "#cccccc")) +
  labs(x = "Advancement", y = "Number of Participants", fill = "Condition")

print(plot)

  2. After running the code we realised we needed to add some theme adjustments. We also needed to change the order in which the variables appeared along the x-axis.
# Count the participants in every condition x advancement combination and
# replace the raw codes with readable labels.
# count() returns an ungrouped table, so no grouping is carried into the
# recoding step and summarise() no longer prints a grouping message.
# The 'Non-conflicting/Generic' label previously contained a typo
# ('Non-conflicing'), which showed up in the plot legend.
figure2 <- cleandata_exp1 %>%
  count(condition, advancement) %>%
  mutate(
    # recode condition titles
    condition = recode(
      condition,
      "Block_1_Generic_Conflict" = "Conflicting/Generic",
      "Block_3_Qualified_Conflict" = "Conflicting/Qualified",
      "Block_2_Generic_Consistent" = "Non-conflicting/Generic",
      "Block_4_Qualified_Consistent" = "Non-conflicting/Qualified"
    ),
    # recode advancement titles
    advancement = recode(
      advancement,
      "-1" = "Less",
      "0" = "Same",
      "1" = "More"
    )
  )

# Set the factor levels so the x-axis reads Less, Same, More rather than
# alphabetical order
figure2$advancement <- factor(figure2$advancement, levels = c("Less", "Same", "More"))

# Plot the counts as dodged bars, one bar per condition, in greyscale
plot <- ggplot(figure2, aes(x = advancement, y = n, fill = condition)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("#333333", "#818181", "#ababab", "#cccccc")) +
  labs(x = "Advancement", y = "Number of Participants", fill = "Condition")
print(plot)

#Calculate descriptive results for Experiment 1

# List the outcome variables for study 1
study_1_outcome_variables <- c("contradiction", "confusion", "advancement")

# Mean and standard deviation of every outcome variable, per Conflict level.
# summarise() collapses each group to one row directly, instead of repeating
# the statistics on every row with mutate() and then keeping the first row.
# all_of() raises an error if an outcome column is missing, rather than
# silently leaving it out of the table.
study_1_outcome_variables_df <- figure1 %>%
  group_by(Conflict) %>%
  summarise(
    across(
      all_of(study_1_outcome_variables),
      list(mean = mean, sd = sd),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  ) %>%
  # grouping column first, then all the means, then all the standard deviations
  select(Conflict, ends_with("_mean"), ends_with("_sd"))

# Print the study 1 dataframe
study_1_outcome_variables_df
# Mean completion time for Experiment 1: durations are recorded in seconds,
# so the mean is divided by 60 to report it in minutes
average_duration_minutes <-
  mean(as.numeric(cleandata_exp1$duration), na.rm = TRUE) / 60
print(paste("Average Duration in Minutes:", average_duration_minutes))
## [1] "Average Duration in Minutes: 8.48032879818594"

#Clean data for Experiment 2

# Give the awkwardly named columns clearer names
cleandata_exp2 <- data_exp2 %>%
  rename(
    recall_score = SC0,
    condition = FL_12_DO,
    confidence_in_science = GSS,
    duration = `Duration (in seconds)`
  )

# Apply exclusion criteria: remove participants who did not answer seriously,
# did not finish, or did not recall four or more items.
# The recall score is converted to a number before the comparison: if the
# column is stored as text, `recall_score >= 4` is evaluated alphabetically
# and a score such as "10" would wrongly be excluded. The conversion runs in
# a second filter() so it is only applied to rows that passed the first two
# checks.
# NOTE(review): unlike Experiment 1, the first two rows of the raw export are
# not dropped explicitly here -- presumably the Serious_check filter removes
# them; confirm against the raw file.
cleandata_exp2 <- cleandata_exp2 %>%
  filter(Serious_check == 1, Finished == 1) %>%
  filter(as.numeric(recall_score) >= 4) %>%
  # keep only the columns needed for the demographics and Figure 3
  select(
    duration, Gender, Age, recall_score, condition, Serious_check, Finished,
    NC_1:Development_sci_know_6
  )

#Reproduce Demographics for Experiment 2

# Demographics for Experiment 2
# Tally the participants within each gender code
gender_counts2 <- count(cleandata_exp2, Gender, name = "count")

# There are 150 males "1" and 248 females "2"
gender_counts2
# Age
# make sure Age is stored as a number before summarising it
cleandata_exp2$Age <- as.numeric(as.character(cleandata_exp2$Age))

# Range, mean and standard deviation of age (missing values ignored)
# Participants were aged 18-73
min(cleandata_exp2[["Age"]], na.rm = TRUE)
## [1] 18
max(cleandata_exp2[["Age"]], na.rm = TRUE)
## [1] 73
# Mean age was 33.46
mean(cleandata_exp2[["Age"]], na.rm = TRUE)
## [1] 33.465
# Standard deviation was 12.03
sd(cleandata_exp2[["Age"]], na.rm = TRUE)
## [1] 12.03415

#Reproduce Figure 3

We mainly used the same method as for Figure 1. However, we were so intent on repeating the Figure 1 process that we didn’t realise that the experimenters averaged the scores for each category of questions this time around, rather than summing them. This first attempt is shown below, and the resulting graph is NOT right at all.

# First attempt at the Figure 3 data: each scale is SUMMED across its items.
# confidence_in_science needs no total because only one question was asked.
first_attempt <- cleandata_exp2 %>%
  # split the condition label into its four parts
  separate(
    col = condition,
    into = c("block", "number", "Format", "Conflict")
  ) %>%
  # shorten the Conflict labels to the abbreviations used on the plot axis
  mutate(
    Conflict = case_when(
      Conflict == "Conflict" ~ "Conf.",
      Conflict == "Consistent" ~ "Non.Conf.",
      TRUE ~ Conflict # any other value is left unchanged
    )
  ) %>%
  # convert every questionnaire item to numeric
  mutate(across(NC_1:Development_sci_know_6, as.numeric)) %>%
  # one total per scale; missing items are skipped
  mutate(
    nutritional_confusion = rowSums(
      select(., NC_1:NC_6),
      na.rm = TRUE
    ),
    nutritional_backlash = rowSums(
      select(., NBS_1:NBS_6),
      na.rm = TRUE
    ),
    mistrust_of_expertise = rowSums(
      select(., Mistrust_expertise_1:Mistrust_expertise_3),
      na.rm = TRUE
    ),
    certainty_of_knowledge = rowSums(
      select(., Certainty_sci_know_1:Certainty_sci_know_6),
      na.rm = TRUE
    ),
    development_of_knowledge = rowSums(
      select(., Development_sci_know_1:Development_sci_know_6),
      na.rm = TRUE
    )
  )


#CREATE Figure 3

# Lay the panels out on a grid of 3 rows and 2 columns
#fig.height=12
par(mfrow = c(3, 2))

# Draw one panel from the first-attempt data. `label` is used both as the
# panel title and as the y-axis label. The default y-axis is suppressed and
# redrawn with a tick at every whole number between the two y limits.
draw_first_attempt_panel <- function(outcome_formula, label, y_limits) {
  pirateplot(
    formula = outcome_formula,
    data = first_attempt,
    inf.method = "ci",
    yaxt = "n",
    ylim = y_limits,
    theme = 1,
    main = label,
    ylab = label,
    cex.names = 0.75,
    cex.lab = 0.9
  )
  axis(2, at = seq(from = y_limits[1], to = y_limits[2], by = 1))
}

# First plot: Nutritional Confusion
draw_first_attempt_panel(
  nutritional_confusion ~ Conflict * Format,
  label = "Nutritional Confusion",
  y_limits = c(0, 5)
)

# Second plot: Nutritional Backlash
draw_first_attempt_panel(
  nutritional_backlash ~ Conflict * Format,
  label = "Nutritional Backlash",
  y_limits = c(0, 5)
)

# Third plot: Mistrust of Expertise
draw_first_attempt_panel(
  mistrust_of_expertise ~ Conflict * Format,
  label = "Mistrust of Expertise",
  y_limits = c(0, 5)
)

# Fourth plot: Confidence in Scientific Community
draw_first_attempt_panel(
  confidence_in_science ~ Conflict * Format,
  label = "Confidence in Scientific Community",
  y_limits = c(0, 3)
)

# Fifth plot: Certainty of Knowledge
draw_first_attempt_panel(
  certainty_of_knowledge ~ Conflict * Format,
  label = "Certainty of Knowledge",
  y_limits = c(0, 5)
)

# Sixth plot: Development of Knowledge
draw_first_attempt_panel(
  development_of_knowledge ~ Conflict * Format,
  label = "Development of Knowledge",
  y_limits = c(0, 5)
)

#PREPARE DATA for Figure 3



# Figure 3 data: each scale is AVERAGED across its items.
# confidence_in_science needs no average because only one question was asked.
figure3 <- cleandata_exp2 %>%
  # split the condition label into its four parts
  separate(
    col = condition,
    into = c("block", "number", "Format", "Conflict")
  ) %>%
  # shorten the Conflict labels to the abbreviations used on the plot axis
  mutate(
    Conflict = case_when(
      Conflict == "Conflict" ~ "Conf.",
      Conflict == "Consistent" ~ "Non.Conf.",
      TRUE ~ Conflict # any other value is left unchanged
    )
  ) %>%
  # convert every questionnaire item to numeric
  mutate(across(NC_1:Development_sci_know_6, as.numeric)) %>%
  # one mean per scale; missing items are skipped
  mutate(
    nutritional_confusion = rowMeans(
      select(., NC_1:NC_6),
      na.rm = TRUE
    ),
    nutritional_backlash = rowMeans(
      select(., NBS_1:NBS_6),
      na.rm = TRUE
    ),
    mistrust_of_expertise = rowMeans(
      select(., Mistrust_expertise_1:Mistrust_expertise_3),
      na.rm = TRUE
    ),
    certainty_of_knowledge = rowMeans(
      select(., Certainty_sci_know_1:Certainty_sci_know_6),
      na.rm = TRUE
    ),
    development_of_knowledge = rowMeans(
      select(., Development_sci_know_1:Development_sci_know_6),
      na.rm = TRUE
    )
  )


#CREATE Figure 3

# Lay the panels out on a grid of 3 rows and 2 columns
#fig.height=12
par(mfrow = c(3, 2))

# Draw one Figure 3 panel. `label` is used both as the panel title and as the
# y-axis label. The default y-axis is suppressed and redrawn with a tick at
# every whole number between the two y limits.
draw_figure3_panel <- function(outcome_formula, label, y_limits) {
  pirateplot(
    formula = outcome_formula,
    data = figure3,
    inf.method = "ci",
    yaxt = "n",
    ylim = y_limits,
    theme = 1,
    main = label,
    ylab = label,
    cex.names = 0.75,
    cex.lab = 0.9
  )
  axis(2, at = seq(from = y_limits[1], to = y_limits[2], by = 1))
}

# First plot: Nutritional Confusion
draw_figure3_panel(
  nutritional_confusion ~ Conflict * Format,
  label = "Nutritional Confusion",
  y_limits = c(1, 5)
)

# Second plot: Nutritional Backlash
draw_figure3_panel(
  nutritional_backlash ~ Conflict * Format,
  label = "Nutritional Backlash",
  y_limits = c(1, 5)
)

# Third plot: Mistrust of Expertise
draw_figure3_panel(
  mistrust_of_expertise ~ Conflict * Format,
  label = "Mistrust of Expertise",
  y_limits = c(1, 5)
)

# Fourth plot: Confidence in Scientific Community
draw_figure3_panel(
  confidence_in_science ~ Conflict * Format,
  label = "Confidence in Scientific Community",
  y_limits = c(1, 3)
)

# Fifth plot: Certainty of Knowledge
draw_figure3_panel(
  certainty_of_knowledge ~ Conflict * Format,
  label = "Certainty of Knowledge",
  y_limits = c(1, 5)
)

# Sixth plot: Development of Knowledge
draw_figure3_panel(
  development_of_knowledge ~ Conflict * Format,
  label = "Development of Knowledge",
  y_limits = c(1, 5)
)

#Calculate descriptive results for Experiment 2

# List the outcome variables for study 2
study_2_outcome_variables <- c(
  "nutritional_confusion",
  "nutritional_backlash",
  "mistrust_of_expertise",
  "certainty_of_knowledge",
  "development_of_knowledge"
)

# Mean and standard deviation of every outcome variable, per Format x Conflict
# cell. summarise() collapses each group to one row directly, instead of
# repeating the statistics on every row with mutate() and then keeping the
# first row. all_of() raises an error if an outcome column is missing, rather
# than silently leaving it out of the table.
study_2_outcome_variables_df <- figure3 %>%
  group_by(Format, Conflict) %>%
  summarise(
    across(
      all_of(study_2_outcome_variables),
      list(mean = mean, sd = sd),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  ) %>%
  # grouping columns first, then all the means, then all the standard deviations
  select(Format, Conflict, ends_with("_mean"), ends_with("_sd"))

# Print the study 2 dataframe
study_2_outcome_variables_df
# Mean completion time for Experiment 2: durations are recorded in seconds,
# so the mean is divided by 60 to report it in minutes
average_duration_minutes_2 <-
  mean(as.numeric(cleandata_exp2$duration), na.rm = TRUE) / 60
print(paste("Average Duration in Minutes:", average_duration_minutes_2))
## [1] "Average Duration in Minutes: 10.8946666666667"