This is my analysis of Nicky Case’s online experiment on the generation effect.
# Load the necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(ggpubr)
library(ggplotify)
library(ggthemes)
# Load the data
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
learnExp<- getURL("https://raw.githubusercontent.com/cdromcom/inst314jp/refs/heads/main/ncase-generation-effect-data-clean.csv")
geneff <- read.csv(text = learnExp)
# Check the structure of the data
str(geneff)
## 'data.frame': 1620 obs. of 18 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ timestamp : chr "2019/01/10 11:13:00 AM EST" "2019/01/10 11:32:59 AM EST" "2019/01/10 11:33:55 AM EST" "2019/01/10 11:34:50 AM EST" ...
## $ likeable : int 7 4 3 7 7 5 7 6 8 3 ...
## $ surprising: int 3 5 3 3 7 4 5 3 6 6 ...
## $ q1 : num -88 -30 -85 -88 -86 -50 -80 -50 -88 88 ...
## $ q2 : int 15 30 20 30 32 20 -20 300 25 25 ...
## $ q3 : num 1 1 3 1 1 1 1 1 1 1 ...
## $ q4 : num 15 14 35 15 26 40 20 15 15 15 ...
## $ q5 : num 33 10 36 30 32 30 60 15 32 32 ...
## $ q6 : int -80 -50 -86 -88 -36 -66 -84 85 94 88 ...
## $ q7 : int -33 -5 100 -33 16 -30 -40 -30 -33 55 ...
## $ q8 : num 58 51 50 56 56 60 80 56 56 53 ...
## $ q9 : num -50 -30 -30 -51 -17 -40 -30 -15 -52 -50 ...
## $ q10 : int 33 10 300 300 40 20 -60 300 302 302 ...
## $ age : int 27 18 21 45 17 25 38 32 17 23 ...
## $ gender : chr "m" "m" "m" "m" ...
## $ edu : chr "4yr" "high" "high" "4yr" ...
## $ group : int 0 0 0 0 0 0 0 0 0 0 ...
# Check the first few rows of the data
head(geneff)
# Check the number of rows and columns
dim(geneff)
## [1] 1620 18
# Check the column names
colnames(geneff)
## [1] "X" "timestamp" "likeable" "surprising" "q1"
## [6] "q2" "q3" "q4" "q5" "q6"
## [11] "q7" "q8" "q9" "q10" "age"
## [16] "gender" "edu" "group"
# Check the unique values in the 'condition' column
unique(geneff$condition)
## NULL
# Check the unique values in the 'item' column
unique(geneff$item)
## NULL
# Check the unique values in the 'response' column
unique(geneff$response)
## NULL
# Check the unique values in the 'correct' column
unique(geneff$correct)
## NULL
# Check the unique values in the 'rt' column
unique(geneff$rt)
## NULL
summary(geneff)
## X timestamp likeable surprising
## Min. : 0.0 Length:1620 Min. : 1.00 Min. : 1.000
## 1st Qu.: 404.8 Class :character 1st Qu.: 5.00 1st Qu.: 4.000
## Median : 809.5 Mode :character Median : 7.00 Median : 6.000
## Mean : 809.5 Mean : 6.58 Mean : 5.351
## 3rd Qu.:1214.2 3rd Qu.: 8.00 3rd Qu.: 7.000
## Max. :1619.0 Max. :10.00 Max. :10.000
##
## q1 q2 q3 q4
## Min. :-300.00 Min. : -88.0 Min. : -30 Min. : -15.00
## 1st Qu.: -88.00 1st Qu.: 25.0 1st Qu.: 1 1st Qu.: 15.00
## Median : -80.00 Median : 30.0 Median : 1 Median : 15.00
## Mean : -57.46 Mean : 676.8 Mean : 1236 Mean : 18.53
## 3rd Qu.: -47.50 3rd Qu.: 51.0 3rd Qu.: 1 3rd Qu.: 16.00
## Max. : 98.00 Max. :1001001.0 Max. :2000000 Max. :1512.00
##
## q5 q6 q7 q8
## Min. : -36.00 Min. :-1000.00 Min. :-300.00 Min. : 0.00
## 1st Qu.: 28.00 1st Qu.: -84.00 1st Qu.: -33.00 1st Qu.:53.00
## Median : 32.00 Median : -80.00 Median : -30.00 Median :56.00
## Mean : 33.12 Mean : -55.57 Mean : -21.36 Mean :54.46
## 3rd Qu.: 33.00 3rd Qu.: -50.00 3rd Qu.: -10.00 3rd Qu.:56.00
## Max. :5625.00 Max. : 9001.00 Max. : 130.00 Max. :98.00
##
## q9 q10 age gender
## Min. :-200.00 Min. :-302.0 Min. :11.00 Length:1620
## 1st Qu.: -51.00 1st Qu.: 200.0 1st Qu.:23.00 Class :character
## Median : -50.00 Median : 300.0 Median :29.00 Mode :character
## Mean : -35.89 Mean : 241.7 Mean :31.32
## 3rd Qu.: -30.00 3rd Qu.: 302.0 3rd Qu.:37.00
## Max. : 302.00 Max. :3000.0 Max. :80.00
## NA's :129
## edu group
## Length:1620 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :1.0000
## Mean :0.5006
## 3rd Qu.:1.0000
## Max. :1.0000
##
# Create summary tables of each variable
summary(geneff$condition)
## Length Class Mode
## 0 NULL NULL
# Create summary tables of each variable
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(geneff, type = "text", title = "Summary Statistics", digits = 2)
##
## Summary Statistics
## ========================================================
## Statistic N Mean St. Dev. Min Max
## --------------------------------------------------------
## X 1,620 809.50 467.80 0 1,619
## likeable 1,620 6.58 2.17 1 10
## surprising 1,620 5.35 2.06 1 10
## q1 1,620 -57.46 54.01 -300.00 98.00
## q2 1,620 676.82 24,868.76 -88 1,001,001
## q3 1,620 1,236.40 49,690.35 -30.00 2,000,000.00
## q4 1,620 18.53 38.69 -15.00 1,512.00
## q5 1,620 33.11 139.51 -36.00 5,625.00
## q6 1,620 -55.57 237.12 -1,000 9,001
## q7 1,620 -21.36 29.83 -300 130
## q8 1,620 54.46 10.07 0.00 98.00
## q9 1,620 -35.89 32.80 -200.00 302.00
## q10 1,620 241.67 141.08 -302 3,000
## age 1,491 31.32 11.42 11 80
## group 1,620 0.50 0.50 0 1
## --------------------------------------------------------
# Check the unique values in the 'group' column
unique(geneff$group)
## [1] 0 1
# Check the group variable
class(geneff$group)
## [1] "integer"
# factor the group variable
geneff$group <- factor(geneff$group, levels = c("0", "1"), labels = c("Control", "Generation"))
# Check the group variable again
class(geneff$group)
## [1] "factor"
levels(geneff$group)
## [1] "Control" "Generation"
unique(geneff$group)
## [1] Control Generation
## Levels: Control Generation
unique(geneff$group)
## [1] Control Generation
## Levels: Control Generation
# Check the unique values in the age column
unique(geneff$age)
## [1] 27 18 21 45 17 25 38 32 23 19 41 20 26 28 31 36 22 40 NA 34 24 37 39 29 43
## [26] 33 30 35 48 51 44 52 59 15 54 42 47 56 49 71 67 75 16 46 57 68 64 58 72 11
## [51] 14 53 50 60 65 55 62 61 63 13 12 69 80 66
# Drop the NA values in the age column
geneff <- geneff %>% filter(!is.na(age))
# Check the unique values in the age column again
unique(geneff$age)
## [1] 27 18 21 45 17 25 38 32 23 19 41 20 26 28 31 36 22 40 34 24 37 39 29 43 33
## [26] 30 35 48 51 44 52 59 15 54 42 47 56 49 71 67 75 16 46 57 68 64 58 72 11 14
## [51] 53 50 60 65 55 62 61 63 13 12 69 80 66
# Create a ridgeline density plot of the ages
library(ggridges)
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
ggplot(geneff, aes(x = age, y = group, fill = group)) +
geom_density_ridges() +
#scale_fill_viridis_d(option = "A") +
coord_cartesian(clip = "off")+
ggtitle("Ages (in years) of Online Participants by Experimental Condition") +
#xlab("Age") +
#ylab("Group") +
#theme_ridges()
# scale x-axis from 0 to 100
scale_x_continuous(limits = c(0, 100)) +
# use theme_minimal and remove the legend
theme_minimal() +
theme(legend.position = "none")
## Picking joint bandwidth of 2.59
# Create a ridgeline density plot of the likeability by condition
ggplot(geneff, aes(x = likeable, y = group, fill = group)) +
geom_density_ridges() +
#scale_fill_viridis_d(option = "A") +
coord_cartesian(clip = "off")+
ggtitle("Likeability ratings of the study by condition") +
#xlab("Age") +
#ylab("Group") +
#theme_ridges()
# scale x-axis from 0 to 10
scale_x_continuous(limits = c(0, 12)) +
# use theme_minimal and remove the legend
theme_minimal() +
theme(legend.position = "none")
## Picking joint bandwidth of 0.445
# Create a ridgeline density plot of the surprise by condition
ggplot(geneff, aes(x = surprising, y = group, fill = group)) +
geom_density_ridges() +
#scale_fill_viridis_d(option = "A") +
coord_cartesian(clip = "off")+
ggtitle("Likeability ratings of the study by condition") +
#xlab("Age") +
#ylab("Group") +
#theme_ridges()
# scale x-axis from 0 to 10
scale_x_continuous(limits = c(0, 12)) +
# use theme_minimal and remove the legend
theme_minimal() +
theme(legend.position = "none")
## Picking joint bandwidth of 0.479
# Summary the q1-q10 values in a sum score and store than in a new column called avg_score
geneff <- geneff %>%
mutate(avg_score = rowMeans(select(., starts_with("q")), na.rm = TRUE))
# Create a ridgeline density plot of the avg_score by condition
ggplot(geneff, aes(x = avg_score, y = group, fill = group)) +
geom_density_ridges() +
#scale_fill_viridis_d(option = "A") +
coord_cartesian(clip = "off")+
ggtitle("Distributions of average scores by condition") +
#xlab("Age") +
#ylab("Group") +
#theme_ridges()
# scale x-axis from 0 to 100
scale_x_continuous(limits = c(-100, 100)) +
# use theme_minimal and remove the legend
theme_minimal() +
theme(legend.position = "none")
## Picking joint bandwidth of 2.9
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
# Create overlapping ridgeline density plots of the avg_score by condition
ggplot(geneff, aes(x = avg_score, y = group, fill = group)) +
geom_density_ridges(alpha = 0.5) +
#scale_fill_viridis_d(option = "A") +
coord_cartesian(clip = "off")+
ggtitle("Distributions of average scores by condition") +
#xlab("Age") +
#ylab("Group") +
#theme_ridges()
# scale x-axis from 0 to 100
scale_x_continuous(limits = c(-100, 100)) +
# use theme_minimal and remove the legend
theme_minimal() +
theme(legend.position = "none")
## Picking joint bandwidth of 2.9
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
# Create a raincloud plot of the new avg_score variable by group
library(ggplot2)
library(ggdist)
##
## Attaching package: 'ggdist'
## The following objects are masked from 'package:ggridges':
##
## scale_point_color_continuous, scale_point_color_discrete,
## scale_point_colour_continuous, scale_point_colour_discrete,
## scale_point_fill_continuous, scale_point_fill_discrete,
## scale_point_size_continuous
ggplot(geneff, aes(x = avg_score, y = group)) +
# use geom_stat_halfeye to create the raincloud plot
geom_slab(aes(fill = group), alpha = 0.5) +
#geom_half_violin(aes(fill = group), side = "r", alpha = 0.5) +
geom_point(aes(color = group), position = position_jitter(width = 0.1), size = 2) +
geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.5) +
scale_fill_manual(values = c("#F8766D", "#00BFC4")) +
scale_color_manual(values = c("#F8766D", "#00BFC4")) +
theme_minimal() +
# y-axis scale between 0 and 10
scale_x_continuous(limits = c(-100, 100)) +
labs(title = "Raincloud plot of avg_score by group",
x = "Average Score")
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
theme(legend.position = "none")
## List of 1
## $ legend.position: chr "none"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
library(beeswarm)
# Create a histogram of the q1 variable
ggplot(geneff, aes(x = q1)) +
geom_histogram(binwidth = 10, fill = "maroon", color = "maroon", alpha = 0.5) +
labs(title = "Histogram of Question 1",
x = "Average Score",
y = "Count") +
# scale x-axis from -200 to 200
scale_x_continuous(limits = c(-300, 300)) +
facet_wrap(~ group) +
theme_minimal()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Use a chi-squared test to determine whether where were more people in the control group or the generation group
geneff %>%
group_by(group) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(prop = n / sum(n)) %>%
select(group, prop)
# summarize the gender variable
summary(geneff$gender)
## Length Class Mode
## 1491 character character
class(geneff$gender)
## [1] "character"
#factor gender
geneff$gender <- as.factor(geneff$gender)
# check gender
class(geneff$gender)
## [1] "factor"
unique(geneff$gender)
## [1] m none f nb
## Levels: f m nb none
# Summarize the gender variable
geneff %>%
group_by(gender) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(prop = n / sum(n)) %>%
select(gender, prop)
# Create a crosstabulation of gender and group
library(gmodels)
CrossTable(geneff$gender, geneff$group, prop.chisq = TRUE, prop.t = FALSE, prop.r = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## |-------------------------|
##
##
## Total Observations in Table: 1491
##
##
## | geneff$group
## geneff$gender | Control | Generation | Row Total |
## --------------|------------|------------|------------|
## f | 123 | 140 | 263 |
## | 0.341 | 0.331 | |
## --------------|------------|------------|------------|
## m | 529 | 531 | 1060 |
## | 0.080 | 0.078 | |
## --------------|------------|------------|------------|
## nb | 17 | 25 | 42 |
## | 0.663 | 0.644 | |
## --------------|------------|------------|------------|
## none | 66 | 60 | 126 |
## | 0.243 | 0.237 | |
## --------------|------------|------------|------------|
## Column Total | 735 | 756 | 1491 |
## --------------|------------|------------|------------|
##
##
# Run a chi-squared test of independence for gender and group
chisq.test(geneff$gender, geneff$group)
##
## Pearson's Chi-squared test
##
## data: geneff$gender and geneff$group
## X-squared = 2.6169, df = 3, p-value = 0.4545
# Check education variable
class(geneff$education)
## [1] "NULL"
unique(geneff$education)
## NULL
head(geneff$education)
## NULL
names(geneff)
## [1] "X" "timestamp" "likeable" "surprising" "q1"
## [6] "q2" "q3" "q4" "q5" "q6"
## [11] "q7" "q8" "q9" "q10" "age"
## [16] "gender" "edu" "group" "avg_score"
# Factor the education variable and rename the levels
geneff$edu <- factor(geneff$edu, levels = c("grade", "high", "2yr", "4yr", "masters", "phd"))
# Check the education variable again
class(geneff$edu)
## [1] "factor"
levels(geneff$edu)
## [1] "grade" "high" "2yr" "4yr" "masters" "phd"
# Summarize the education variable
geneff %>%
group_by(edu) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(prop = n / sum(n)) %>%
select(edu, prop)
# Check assumptions for an independent samples t-test
library(performance)
geneffTest <- lm(avg_score ~ group, data = geneff)
check_model(geneffTest)
plot(geneffTest)
# Create a summary table of geneffTest using stargazer
stargazer(geneffTest, type = "text", title = "Summary of geneffTest", digits = 2)
##
## Summary of geneffTest
## ===============================================
## Dependent variable:
## ---------------------------
## avg_score
## -----------------------------------------------
## groupGeneration 125.88
## (134.28)
##
## Constant 27.21
## (95.61)
##
## -----------------------------------------------
## Observations 1,491
## R2 0.001
## Adjusted R2 -0.0001
## Residual Std. Error 2,592.16 (df = 1489)
## F Statistic 0.88 (df = 1; 1489)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Run an independent samples t-test
geneffTTest <- t.test(avg_score ~ group, data = geneff, var.equal = TRUE)
geneffTTest
##
## Two Sample t-test
##
## data: avg_score by group
## t = -0.9375, df = 1489, p-value = 0.3487
## alternative hypothesis: true difference in means between group Control and group Generation is not equal to 0
## 95 percent confidence interval:
## -389.2713 137.5056
## sample estimates:
## mean in group Control mean in group Generation
## 27.21218 153.09505
normcheck <- check_normality(geneff$avg_score)
normcheck
## Warning: Non-normality of raw detected (p < .001).
#check_outliers(geneff$avg_score)
check_collinearity(geneff$avg_score)
## Could not extract the variance-covariance matrix for the conditional
## component of the model. Please try to run `vcov(model)`, which may help
## identifying the problem.
## NULL
# Run a one-way ANOVA predicting avg_score from group
anovaTest <- aov(avg_score ~ group, data = geneff)
summary(anovaTest)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 5.906e+06 5905610 0.879 0.349
## Residuals 1489 1.000e+10 6719274
glimpse(geneff)
## Rows: 1,491
## Columns: 19
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ timestamp <chr> "2019/01/10 11:13:00 AM EST", "2019/01/10 11:32:59 AM EST",…
## $ likeable <int> 7, 4, 3, 7, 7, 5, 7, 6, 8, 3, 7, 6, 7, 3, 3, 4, 7, 6, 7, 7,…
## $ surprising <int> 3, 5, 3, 3, 7, 4, 5, 3, 6, 6, 7, 7, 9, 3, 6, 6, 6, 3, 2, 7,…
## $ q1 <dbl> -88, -30, -85, -88, -86, -50, -80, -50, -88, 88, -88, -88, …
## $ q2 <int> 15, 30, 20, 30, 32, 20, -20, 300, 25, 25, 302, 300, 25, 10,…
## $ q3 <dbl> 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1,…
## $ q4 <dbl> 15, 14, 35, 15, 26, 40, 20, 15, 15, 15, 15, 15, 16, 15, 15,…
## $ q5 <dbl> 33, 10, 36, 30, 32, 30, 60, 15, 32, 32, 20, 33, 34, 30, 35,…
## $ q6 <int> -80, -50, -86, -88, -36, -66, -84, 85, 94, 88, -17, -90, 84…
## $ q7 <int> -33, -5, 100, -33, 16, -30, -40, -30, -33, 55, -20, -33, -3…
## $ q8 <dbl> 58, 51, 50, 56, 56, 60, 80, 56, 56, 53, 50, 55, 52, 56, 56,…
## $ q9 <dbl> -50, -30, -30, -51, -17, -40, -30, -15, -52, -50, -50, -50,…
## $ q10 <int> 33, 10, 300, 300, 40, 20, -60, 300, 302, 302, 302, 300, 302…
## $ age <int> 27, 18, 21, 45, 17, 25, 38, 32, 17, 23, 19, 25, 17, 41, 20,…
## $ gender <fct> m, m, m, m, m, m, none, f, nb, none, m, m, m, m, none, m, f…
## $ edu <fct> 4yr, high, high, 4yr, NA, 4yr, high, phd, grade, 4yr, high,…
## $ group <fct> Control, Control, Control, Control, Control, Control, Contr…
## $ avg_score <dbl> -9.6, 0.1, 34.3, 17.2, 6.4, -1.5, -15.3, 67.7, 35.2, 60.9, …
# Conduct a non-parametric t.test to compare the groups by avg_score
geneffWilcox <- wilcox.test(avg_score ~ group, data = geneff)
geneffWilcox
##
## Wilcoxon rank sum test with continuity correction
##
## data: avg_score by group
## W = 307582, p-value = 0.0003444
## alternative hypothesis: true location shift is not equal to 0