Coding goals for week 4

Challenges/successes

#Loading packages/Data
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
# Import the raw study data. As the column specification printed below shows,
# every column is parsed as character.
mydata <- readr::read_csv(file = "Study_1_data.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

And here is where our data wrangling begins! 1. Filtering data to arrive at a tidied version (this step was pretty straightforward!)

# Rows 2 and 3 of the data set (the first two rows after the header) are filled
# with text rather than relevant data, so we drop them (Jia found some lovely
# code to do this).
mydata <- slice(mydata, -(1:2))

# Give two variables readable names: "SC0" becomes "Recall_score" and
# "FL_10_DO" becomes "Condition".
mydata <- rename(mydata, Recall_score = SC0, Condition = FL_10_DO)

# Some participants completed the study twice, so there are duplicated IDs
# (Prolific_PID variable) that must be filtered out.
mydata %>% count(duplicated(Prolific_PID)) # count duplicates
## # A tibble: 2 x 2
##   `duplicated(Prolific_PID)`     n
##   <lgl>                      <int>
## 1 FALSE                        312
## 2 TRUE                          59
# Flag every row as duplicate / not duplicate. This yields one value per row,
# so transmute() is used: returning more than one row from summarise() is
# deprecated from dplyr 1.1.0.
mydata %>% transmute(duplicated(Prolific_PID)) # identify duplicates
## # A tibble: 371 x 1
##    `duplicated(Prolific_PID)`
##    <lgl>                     
##  1 FALSE                     
##  2 FALSE                     
##  3 FALSE                     
##  4 FALSE                     
##  5 FALSE                     
##  6 FALSE                     
##  7 FALSE                     
##  8 FALSE                     
##  9 FALSE                     
## 10 FALSE                     
## # … with 361 more rows
# Identify the distinct IDs (312) - these should remain after removing duplicates
mydata %>% distinct(Prolific_PID)
## # A tibble: 312 x 1
##    Prolific_PID            
##    <chr>                   
##  1 5ea9f5919b12c2176f64927c
##  2 5e70f7033953db048bc49caf
##  3 5e2a086c2927f40710bdc706
##  4 5ec6d7347ff48e101440546b
##  5 5a6bb6b79d65ec00017e825c
##  6 5dd7abaaaf16d3746667182c
##  7 5cd1836da6f34300017e240c
##  8 5e7112783f1bd301e39e60a2
##  9 5c8a6e46926c0b0011b9606a
## 10 5d62b2186f363200168bbb85
## # … with 302 more rows
# Keep the first response for each ID; duplicated() marks every later
# occurrence, so those repeat responses are dropped.
mydata <- mydata %>%
  filter(!duplicated(Prolific_PID))
mydata %>% summarise(n()) # No. of obs. (312)
## # A tibble: 1 x 1
##   `n()`
##   <int>
## 1   312
# Applying exclusion criteria: keep participants who finished the study
# (Finished == 1), declared that they answered seriously (Serious_check == 1)
# AND scored 4 or above on recall.
# ***Filtering for 'Finished' excludes those who didn't consent automatically
# Recall_score is stored as text (every column was read in as character), so it
# is converted to a number before the comparison. Comparing it as a string
# would sort character by character, e.g. ranking "10" below "4".
# THEN creating a subset of relevant variables
mydata <- mydata %>%
  filter(
    Finished == 1,
    Serious_check == 1,
    as.numeric(Recall_score) >= 4
  ) %>%
  select(
    Finished,
    `Duration (in seconds)`,
    Gender,
    Age,
    Serious_check,
    Recall_score,
    Condition,
    contradiction_1:advancement
  )

# Final sample size = 294
summarise(mydata, n())
## # A tibble: 1 x 1
##   `n()`
##   <int>
## 1   294
# Save the tidied subset as a .csv file
mydata %>%
  write_csv("MyDataSubset.csv")

## For the contradiction plot we need a NEW variable: the sum of the six
## contradiction ratings.
# FIRST change all six contradiction variables from character to numeric
# (summing the ratings does not work while they are character).
mydata <- mydata %>%
  mutate(across(contradiction_1:contradiction_6, as.numeric))

# THEN add the six ratings together row by row. rowSums() is vectorised, so the
# data does not have to be switched into rowwise() mode (which would otherwise
# stay attached to mydata for later steps). A missing rating still gives NA,
# just as sum() did.
mydata <- mydata %>%
  mutate(contradiction = rowSums(across(contradiction_1:contradiction_6)))

## For all plots
# Separate the data in Condition into 4 columns to separate the levels of each
# IV ("Block_1_Generic_Conflict"...)
mydata <- separate(mydata, Condition, c("block", "number", "Format", "Conflict"))

# Set the new IV columns as factors, and rename the Conflict levels to mimic
# the labels in the plots. The renaming has to happen on the factor: while
# Conflict is still a character column it has no levels, so renaming levels
# before the conversion has no effect.
mydata <- mydata %>%
  mutate(
    Format = as.factor(Format),
    Conflict = fct_recode(
      as.factor(Conflict),
      "Conf." = "Conflict",
      "Non-Conf." = "Consistent"
    )
  )

# Check the class of every column (first class only, one string per column)
vapply(mydata, function(column) class(column)[1L], character(1))
##              Finished Duration (in seconds)                Gender 
##           "character"           "character"           "character" 
##                   Age         Serious_check          Recall_score 
##           "character"           "character"           "character" 
##                 block                number                Format 
##           "character"           "character"              "factor" 
##              Conflict       contradiction_1       contradiction_2 
##              "factor"             "numeric"             "numeric" 
##       contradiction_3       contradiction_4       contradiction_5 
##             "numeric"             "numeric"             "numeric" 
##       contradiction_6             confusion           advancement 
##             "numeric"           "character"           "character" 
##         contradiction 
##             "numeric"
write_csv(mydata, "MyDataTidiedSubset.csv")

2. Descriptive stats > Calculating demographics (Exp 1)

# Age
# Age is stored as character (every column was read in as text). Left that
# way, mean() returns NA with a warning and range() compares strings instead of
# numbers, so Age is converted to numeric before summarising. min() and max()
# replace range() so the summary is a single row.
# Values noted with the original code: M = 34.29, SD = 12.67, Range = 18-69
# -- TODO confirm against the output of this corrected summary.
# (The previously printed output came from the character column and has been
# removed; re-run this chunk to regenerate it.)
mydata %>%
  mutate(Age = as.numeric(Age)) %>%
  summarise(
    mean_age = mean(Age),
    sd_age = sd(Age),
    min_age = min(Age),
    max_age = max(Age)
  )
# GENDER
# Coding: males = '1', females = '2', Other = '3', Prefer not to say = '4'.
# Each count() call tabulates how many rows do / do not match one code.
Males <- count(mydata, Gender == 1) # n = 126
print(Males)
## # A tibble: 2 x 2
##   `Gender == 1`     n
##   <lgl>         <int>
## 1 FALSE           168
## 2 TRUE            126
Females <- count(mydata, Gender == 2) # n = 168
print(Females)
## # A tibble: 2 x 2
##   `Gender == 2`     n
##   <lgl>         <int>
## 1 FALSE           126
## 2 TRUE            168
count(mydata, Gender == 3) # n = 0
## # A tibble: 1 x 2
##   `Gender == 3`     n
##   <lgl>         <int>
## 1 FALSE           294
count(mydata, Gender == 4) # n = 0
## # A tibble: 1 x 2
##   `Gender == 4`     n
##   <lgl>         <int>
## 1 FALSE           294

3. Descriptive stats > Descriptive ggplots (Exp 1)

CHALLENGE: producing violin plots that are close to the ones in the paper is proving to be extremely difficult. In my initial attempts the ‘violins’ were stacked on top of each other rather than side by side, and changing the axes to reflect the appropriate IVs and DVs was tricky. Within our team, we shared our ideas and helped each other create plots more similar to those in the paper; however, we still have a long way to go in terms of plot aesthetics and adding more descriptive features (means and confidence intervals).

CHALLENGE: The scale on the contradiction plot in the paper starts from ‘0’, not 5 (which is what I produced). I tried changing the y-axis scale (0 - 30) and label (“Perceived Contradiction”) simultaneously but had no luck, until my group helped me rearrange my code (thanks guys):

#Contradiction plot - My faulty code:
# Violin plot of the summed contradiction score by Conflict, with one panel
# per Format. Kept as a record of what went wrong: the y axis is configured
# twice, first with scale_y_continuous() and then with ylim(). As the message
# printed below reports, the second y scale replaces the first.
ggplot(mydata) +
  geom_violin(aes(x = Conflict, y = contradiction)) + 
  facet_wrap(vars(Format), strip.position = "bottom")+
  ggtitle(label = "Contradiction")+
  # First y scale: sets the axis title only.
  scale_y_continuous(name = "Perceived Contradiction")+
  # Second y scale: sets the limits, and triggers the message below.
  ylim(0,30)+
  scale_x_discrete(name = NULL)+
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

# Issue resolved: the axis title and the 0-30 limits are both given in a single
# scale_y_continuous() call, so only one y scale is added to the plot.
ggplot(data = mydata, mapping = aes(x = Conflict, y = contradiction)) +
  geom_violin() +
  facet_wrap(~Format, strip.position = "bottom") +
  scale_x_discrete(name = NULL) +
  scale_y_continuous(name = "Perceived Contradiction", limits = c(0, 30)) +
  labs(title = "Contradiction") +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title

# Advancement plot
# The advancement ratings are stored as character, so make them numeric before
# plotting them on a continuous y axis.
mydata <- mutate(mydata, advancement = as.numeric(advancement))
ggplot(data = mydata, mapping = aes(x = Conflict, y = advancement)) +
  geom_violin() +
  facet_wrap(~Format, strip.position = "bottom") +
  scale_x_discrete(name = NULL) +
  scale_y_continuous(name = "Perceived Scientific Advancement") +
  labs(title = "Advancement") +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title

# Confusion plot
# The confusion ratings are stored as character, so make them numeric before
# plotting them on a continuous y axis.
mydata <- mutate(mydata, confusion = as.numeric(confusion))
ggplot(data = mydata, mapping = aes(x = Conflict, y = confusion)) +
  geom_violin() +
  facet_wrap(~Format, strip.position = "bottom") +
  scale_x_discrete(name = NULL) +
  scale_y_continuous(name = "Perceived Confusion") +
  labs(title = "Confusion") +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title

The next steps