# Load packages
library(tidyverse) # used to clean, manipulate and visualise data 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import the raw survey export. As the column specification below shows,
# every one of the 340 columns is read as character.
data <- read_csv("/cloud/project/Study 8 data.csv")
## Rows: 373 Columns: 340
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (340): StartDate, EndDate, Status, IPAddress, Progress, Duration (in sec...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop the first two rows so that only response rows remain
# NOTE(review): presumably these are extra header/label rows from the survey
# export -- confirm against the raw file.
data <- data[-(1:2), ]
# Size of the original sample before any exclusions: 371 participants
count_participants <- nrow(data)
count_participants
## [1] 371
# Apply exclusion criteria
# Step 1: keep a single response per participant, identified by Prolific_PID.
# IDs that occur more than once in the raw data (kept for reference)
duplicates <- with(count(data, Prolific_PID), Prolific_PID[n > 1])
# Retain only the first row recorded for each Prolific_PID.
# NOTE(review): rows with a missing Prolific_PID (if any) form one group, so
# only the first of them is kept -- confirm this is intended.
data_filtered <- data %>%
  group_by(Prolific_PID) %>%
  slice_head(n = 1) %>%
  ungroup()
# 312 participants remain; 59 repeat responses were removed
count_participants <- nrow(data_filtered)
count_participants
## [1] 312
# Apply the remaining exclusion criteria in one pass. A participant is kept
# only when every condition below is TRUE. filter() already drops rows where
# a condition evaluates to NA, so missing responses are excluded too; the
# former `na.rm = TRUE` was not a filter() argument and had no effect.
data_filtered <- data_filtered %>%
  filter(
    # Remove participants who did not consent
    Consent == 1,
    # Remove participants who were not serious
    Serious_check == 1,
    # Remove participants who did not complete
    Finished == 1,
    # Remove participants who failed the attention check. SC0 was read as
    # character, so convert it before comparing: as text, "10" sorts below "4"
    # and a two-digit score would be wrongly excluded.
    as.numeric(SC0) >= 4
  )

# Recorded run: 294 participants remained with the exclusion criteria applied
# (18 removed). Re-check this count if any SC0 score has two or more digits,
# because the comparison is now numeric rather than text-based.
count_participants <- nrow(data_filtered)
count_participants
## [1] 294
# Check demographics
# Tally participants by Gender code
gender_counts <- data_filtered %>%
  count(Gender, name = "count")
# Gender "1" = male (126 participants), "2" = female (168 participants)
gender_counts
## # A tibble: 2 × 2
##   Gender count
##   <chr>  <int>
## 1 1        126
## 2 2        168
# Age
# Age is stored as text; convert it to numeric before summarising
data_filtered$Age <- as.numeric(as.character(data_filtered$Age))

# Range, mean and standard deviation (missing ages are ignored)
# Youngest and oldest participants: 18 and 69
with(data_filtered, min(Age, na.rm = TRUE))
## [1] 18
with(data_filtered, max(Age, na.rm = TRUE))
## [1] 69
# Mean age: 34.29
with(data_filtered, mean(Age, na.rm = TRUE))
## [1] 34.29252
# Standard deviation of age: 12.97
with(data_filtered, sd(Age, na.rm = TRUE))
## [1] 12.96633