inst-314-exercise-04-demo

This is my analysis of Nicky Case’s online experiment on the generation effect.

# Load the necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(ggpubr)
library(ggplotify)
library(ggthemes)

# Load the data

library(RCurl)

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

learnExp<- getURL("https://raw.githubusercontent.com/cdromcom/inst314jp/refs/heads/main/ncase-generation-effect-data-clean.csv")

geneff <- read.csv(text = learnExp)

# Check the structure of the data
str(geneff)

## 'data.frame':    1620 obs. of  18 variables:
##  $ X         : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ timestamp : chr  "2019/01/10 11:13:00 AM EST" "2019/01/10 11:32:59 AM EST" "2019/01/10 11:33:55 AM EST" "2019/01/10 11:34:50 AM EST" ...
##  $ likeable  : int  7 4 3 7 7 5 7 6 8 3 ...
##  $ surprising: int  3 5 3 3 7 4 5 3 6 6 ...
##  $ q1        : num  -88 -30 -85 -88 -86 -50 -80 -50 -88 88 ...
##  $ q2        : int  15 30 20 30 32 20 -20 300 25 25 ...
##  $ q3        : num  1 1 3 1 1 1 1 1 1 1 ...
##  $ q4        : num  15 14 35 15 26 40 20 15 15 15 ...
##  $ q5        : num  33 10 36 30 32 30 60 15 32 32 ...
##  $ q6        : int  -80 -50 -86 -88 -36 -66 -84 85 94 88 ...
##  $ q7        : int  -33 -5 100 -33 16 -30 -40 -30 -33 55 ...
##  $ q8        : num  58 51 50 56 56 60 80 56 56 53 ...
##  $ q9        : num  -50 -30 -30 -51 -17 -40 -30 -15 -52 -50 ...
##  $ q10       : int  33 10 300 300 40 20 -60 300 302 302 ...
##  $ age       : int  27 18 21 45 17 25 38 32 17 23 ...
##  $ gender    : chr  "m" "m" "m" "m" ...
##  $ edu       : chr  "4yr" "high" "high" "4yr" ...
##  $ group     : int  0 0 0 0 0 0 0 0 0 0 ...

# Check the first few rows of the data
head(geneff)

# Check the number of rows and columns
dim(geneff)

## [1] 1620   18

# Check the column names
colnames(geneff)

##  [1] "X"          "timestamp"  "likeable"   "surprising" "q1"        
##  [6] "q2"         "q3"         "q4"         "q5"         "q6"        
## [11] "q7"         "q8"         "q9"         "q10"        "age"       
## [16] "gender"     "edu"        "group"

# Check the unique values in the 'condition' column
unique(geneff$condition)

## NULL

# Check the unique values in the 'item' column
unique(geneff$item)

## NULL

# Check the unique values in the 'response' column
unique(geneff$response)

## NULL

# Check the unique values in the 'correct' column
unique(geneff$correct)

## NULL

# Check the unique values in the 'rt' column
unique(geneff$rt)

## NULL

summary(geneff)

##        X           timestamp            likeable       surprising    
##  Min.   :   0.0   Length:1620        Min.   : 1.00   Min.   : 1.000  
##  1st Qu.: 404.8   Class :character   1st Qu.: 5.00   1st Qu.: 4.000  
##  Median : 809.5   Mode  :character   Median : 7.00   Median : 6.000  
##  Mean   : 809.5                      Mean   : 6.58   Mean   : 5.351  
##  3rd Qu.:1214.2                      3rd Qu.: 8.00   3rd Qu.: 7.000  
##  Max.   :1619.0                      Max.   :10.00   Max.   :10.000  
##                                                                      
##        q1                q2                  q3                q4         
##  Min.   :-300.00   Min.   :    -88.0   Min.   :    -30   Min.   : -15.00  
##  1st Qu.: -88.00   1st Qu.:     25.0   1st Qu.:      1   1st Qu.:  15.00  
##  Median : -80.00   Median :     30.0   Median :      1   Median :  15.00  
##  Mean   : -57.46   Mean   :    676.8   Mean   :   1236   Mean   :  18.53  
##  3rd Qu.: -47.50   3rd Qu.:     51.0   3rd Qu.:      1   3rd Qu.:  16.00  
##  Max.   :  98.00   Max.   :1001001.0   Max.   :2000000   Max.   :1512.00  
##                                                                           
##        q5                q6                 q7                q8       
##  Min.   : -36.00   Min.   :-1000.00   Min.   :-300.00   Min.   : 0.00  
##  1st Qu.:  28.00   1st Qu.:  -84.00   1st Qu.: -33.00   1st Qu.:53.00  
##  Median :  32.00   Median :  -80.00   Median : -30.00   Median :56.00  
##  Mean   :  33.12   Mean   :  -55.57   Mean   : -21.36   Mean   :54.46  
##  3rd Qu.:  33.00   3rd Qu.:  -50.00   3rd Qu.: -10.00   3rd Qu.:56.00  
##  Max.   :5625.00   Max.   : 9001.00   Max.   : 130.00   Max.   :98.00  
##                                                                        
##        q9               q10              age           gender         
##  Min.   :-200.00   Min.   :-302.0   Min.   :11.00   Length:1620       
##  1st Qu.: -51.00   1st Qu.: 200.0   1st Qu.:23.00   Class :character  
##  Median : -50.00   Median : 300.0   Median :29.00   Mode  :character  
##  Mean   : -35.89   Mean   : 241.7   Mean   :31.32                     
##  3rd Qu.: -30.00   3rd Qu.: 302.0   3rd Qu.:37.00                     
##  Max.   : 302.00   Max.   :3000.0   Max.   :80.00                     
##                                     NA's   :129                       
##      edu                group       
##  Length:1620        Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :1.0000  
##                     Mean   :0.5006  
##                     3rd Qu.:1.0000  
##                     Max.   :1.0000  
##

# Create summary tables of each variable

summary(geneff$condition)

## Length  Class   Mode 
##      0   NULL   NULL

# Create summary tables of each variable
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

stargazer(geneff, type = "text", title = "Summary Statistics", digits = 2)

## 
## Summary Statistics
## ========================================================
## Statistic    N     Mean   St. Dev.    Min       Max     
## --------------------------------------------------------
## X          1,620  809.50   467.80      0       1,619    
## likeable   1,620   6.58     2.17       1         10     
## surprising 1,620   5.35     2.06       1         10     
## q1         1,620  -57.46    54.01   -300.00    98.00    
## q2         1,620  676.82  24,868.76   -88    1,001,001  
## q3         1,620 1,236.40 49,690.35 -30.00  2,000,000.00
## q4         1,620  18.53     38.69   -15.00    1,512.00  
## q5         1,620  33.11    139.51   -36.00    5,625.00  
## q6         1,620  -55.57   237.12   -1,000     9,001    
## q7         1,620  -21.36    29.83    -300       130     
## q8         1,620  54.46     10.07    0.00      98.00    
## q9         1,620  -35.89    32.80   -200.00    302.00   
## q10        1,620  241.67   141.08    -302      3,000    
## age        1,491  31.32     11.42     11         80     
## group      1,620   0.50     0.50       0         1      
## --------------------------------------------------------

# Check the unique values in the 'group' column
unique(geneff$group)

## [1] 0 1

# Check the group variable

class(geneff$group)

## [1] "integer"

# factor the group variable

geneff$group <- factor(geneff$group, levels = c("0", "1"), labels = c("Control", "Generation"))

# Check the group variable again
class(geneff$group)

## [1] "factor"

levels(geneff$group)

## [1] "Control"    "Generation"

unique(geneff$group)

## [1] Control    Generation
## Levels: Control Generation

unique(geneff$group)

## [1] Control    Generation
## Levels: Control Generation

# Check the unique values in the age column
unique(geneff$age)

##  [1] 27 18 21 45 17 25 38 32 23 19 41 20 26 28 31 36 22 40 NA 34 24 37 39 29 43
## [26] 33 30 35 48 51 44 52 59 15 54 42 47 56 49 71 67 75 16 46 57 68 64 58 72 11
## [51] 14 53 50 60 65 55 62 61 63 13 12 69 80 66

# Drop the NA values in the age column
geneff <- geneff %>% filter(!is.na(age))
# Check the unique values in the age column again
unique(geneff$age)

##  [1] 27 18 21 45 17 25 38 32 23 19 41 20 26 28 31 36 22 40 34 24 37 39 29 43 33
## [26] 30 35 48 51 44 52 59 15 54 42 47 56 49 71 67 75 16 46 57 68 64 58 72 11 14
## [51] 53 50 60 65 55 62 61 63 13 12 69 80 66

# Create a ridgeline density plot of the ages

library(ggridges)
library(ggplot2)
library(viridis)

## Loading required package: viridisLite

ggplot(geneff, aes(x = age, y = group, fill = group)) +
  geom_density_ridges() +
  #scale_fill_viridis_d(option = "A") +
  coord_cartesian(clip = "off")+
  ggtitle("Ages (in years) of Online Participants by Experimental Condition") +
  #xlab("Age") +
  #ylab("Group") +
  #theme_ridges()
  # scale x-axis from 0 to 100
  scale_x_continuous(limits = c(0, 100)) +
  # use theme_minimal and remove the legend
  theme_minimal() +
  theme(legend.position = "none")

## Picking joint bandwidth of 2.59

# Create a ridgeline density plot of the likeability by condition

ggplot(geneff, aes(x = likeable, y = group, fill = group)) +
  geom_density_ridges() +
  #scale_fill_viridis_d(option = "A") +
  coord_cartesian(clip = "off")+
  ggtitle("Likeability ratings of the study by condition") +
  #xlab("Age") +
  #ylab("Group") +
  #theme_ridges()
  # scale x-axis from 0 to 10
  scale_x_continuous(limits = c(0, 12)) +
  # use theme_minimal and remove the legend
  theme_minimal() +
  theme(legend.position = "none")

## Picking joint bandwidth of 0.445

# Create a ridgeline density plot of the surprise by condition

ggplot(geneff, aes(x = surprising, y = group, fill = group)) +
  geom_density_ridges() +
  #scale_fill_viridis_d(option = "A") +
  coord_cartesian(clip = "off")+
  ggtitle("Likeability ratings of the study by condition") +
  #xlab("Age") +
  #ylab("Group") +
  #theme_ridges()
  # scale x-axis from 0 to 10
  scale_x_continuous(limits = c(0, 12)) +
  # use theme_minimal and remove the legend
  theme_minimal() +
  theme(legend.position = "none")

## Picking joint bandwidth of 0.479

# Summary the q1-q10 values in a sum score and store than in a new column called avg_score

geneff <- geneff %>%
  mutate(avg_score = rowMeans(select(., starts_with("q")), na.rm = TRUE))

# Create a ridgeline density plot of the avg_score by condition

ggplot(geneff, aes(x = avg_score, y = group, fill = group)) +
  geom_density_ridges() +
  #scale_fill_viridis_d(option = "A") +
  coord_cartesian(clip = "off")+
  ggtitle("Distributions of average scores by condition") +
  #xlab("Age") +
  #ylab("Group") +
  #theme_ridges()
  # scale x-axis from 0 to 100
  scale_x_continuous(limits = c(-100, 100)) +
  # use theme_minimal and remove the legend
  theme_minimal() +
  theme(legend.position = "none")

## Picking joint bandwidth of 2.9

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

# Create overlapping ridgeline density plots of the avg_score by condition

ggplot(geneff, aes(x = avg_score, y = group, fill = group)) +
  geom_density_ridges(alpha = 0.5) +
  #scale_fill_viridis_d(option = "A") +
  coord_cartesian(clip = "off")+
  ggtitle("Distributions of average scores by condition") +
  #xlab("Age") +
  #ylab("Group") +
  #theme_ridges()
  # scale x-axis from 0 to 100
  scale_x_continuous(limits = c(-100, 100)) +
  # use theme_minimal and remove the legend
  theme_minimal() +
  theme(legend.position = "none")

## Picking joint bandwidth of 2.9

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

# Create a raincloud plot of the new avg_score variable by group

library(ggplot2)
library(ggdist)

## 
## Attaching package: 'ggdist'

## The following objects are masked from 'package:ggridges':
## 
##     scale_point_color_continuous, scale_point_color_discrete,
##     scale_point_colour_continuous, scale_point_colour_discrete,
##     scale_point_fill_continuous, scale_point_fill_discrete,
##     scale_point_size_continuous

ggplot(geneff, aes(x = avg_score, y = group)) +
  # use geom_stat_halfeye to create the raincloud plot
  geom_slab(aes(fill = group), alpha = 0.5) +
  #geom_half_violin(aes(fill = group), side = "r", alpha = 0.5) +
  geom_point(aes(color = group), position = position_jitter(width = 0.1), size = 2) +
  geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.5) +
  scale_fill_manual(values = c("#F8766D", "#00BFC4")) +
  scale_color_manual(values = c("#F8766D", "#00BFC4")) +
  theme_minimal() +
  # y-axis scale between 0 and 10
  scale_x_continuous(limits = c(-100, 100)) +
  labs(title = "Raincloud plot of avg_score by group",
       x = "Average Score")

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

  theme(legend.position = "none")

## List of 1
##  $ legend.position: chr "none"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

library(beeswarm)
# Create a histogram of the q1 variable

ggplot(geneff, aes(x = q1)) +
  geom_histogram(binwidth = 10, fill = "maroon", color = "maroon", alpha = 0.5) +
  labs(title = "Histogram of Question 1",
       x = "Average Score",
       y = "Count") +
  # scale x-axis from -200 to 200
  scale_x_continuous(limits = c(-300, 300)) +
  facet_wrap(~ group) +
  theme_minimal()

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Use a chi-squared test to determine whether where were more people in the control group or the generation group

geneff %>%
  group_by(group) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(prop = n / sum(n)) %>%
  select(group, prop)

# summarize the gender variable
summary(geneff$gender)

##    Length     Class      Mode 
##      1491 character character

class(geneff$gender)

## [1] "character"

#factor gender
geneff$gender <- as.factor(geneff$gender)

# check gender
class(geneff$gender)

## [1] "factor"

unique(geneff$gender)

## [1] m    none f    nb  
## Levels: f m nb none

# Summarize the gender variable

geneff %>%
  group_by(gender) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(prop = n / sum(n)) %>%
  select(gender, prop)

# Create a crosstabulation of gender and group
library(gmodels)
CrossTable(geneff$gender, geneff$group, prop.chisq = TRUE, prop.t = FALSE, prop.r = FALSE, prop.c = FALSE)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |-------------------------|
## 
##  
## Total Observations in Table:  1491 
## 
##  
##               | geneff$group 
## geneff$gender |    Control | Generation |  Row Total | 
## --------------|------------|------------|------------|
##             f |        123 |        140 |        263 | 
##               |      0.341 |      0.331 |            | 
## --------------|------------|------------|------------|
##             m |        529 |        531 |       1060 | 
##               |      0.080 |      0.078 |            | 
## --------------|------------|------------|------------|
##            nb |         17 |         25 |         42 | 
##               |      0.663 |      0.644 |            | 
## --------------|------------|------------|------------|
##          none |         66 |         60 |        126 | 
##               |      0.243 |      0.237 |            | 
## --------------|------------|------------|------------|
##  Column Total |        735 |        756 |       1491 | 
## --------------|------------|------------|------------|
## 
##

# Run a chi-squared test of independence for gender and group
chisq.test(geneff$gender, geneff$group)

## 
##  Pearson's Chi-squared test
## 
## data:  geneff$gender and geneff$group
## X-squared = 2.6169, df = 3, p-value = 0.4545

# Check education variable
class(geneff$education)

## [1] "NULL"

unique(geneff$education)

## NULL

head(geneff$education)

## NULL

names(geneff)

##  [1] "X"          "timestamp"  "likeable"   "surprising" "q1"        
##  [6] "q2"         "q3"         "q4"         "q5"         "q6"        
## [11] "q7"         "q8"         "q9"         "q10"        "age"       
## [16] "gender"     "edu"        "group"      "avg_score"

# Factor the education variable and rename the levels

geneff$edu <- factor(geneff$edu, levels = c("grade", "high", "2yr", "4yr", "masters", "phd"))

# Check the education variable again
class(geneff$edu)

## [1] "factor"

levels(geneff$edu)

## [1] "grade"   "high"    "2yr"     "4yr"     "masters" "phd"

# Summarize the education variable
geneff %>%
  group_by(edu) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(prop = n / sum(n)) %>%
  select(edu, prop)

# Check assumptions for an independent samples t-test

library(performance)

geneffTest <- lm(avg_score ~ group, data = geneff)
check_model(geneffTest)

plot(geneffTest)

# Create a summary table of geneffTest using stargazer
stargazer(geneffTest, type = "text", title = "Summary of geneffTest", digits = 2)

## 
## Summary of geneffTest
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                              avg_score         
## -----------------------------------------------
## groupGeneration               125.88           
##                              (134.28)          
##                                                
## Constant                       27.21           
##                               (95.61)          
##                                                
## -----------------------------------------------
## Observations                   1,491           
## R2                             0.001           
## Adjusted R2                   -0.0001          
## Residual Std. Error    2,592.16 (df = 1489)    
## F Statistic             0.88 (df = 1; 1489)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

# Run an independent samples t-test
geneffTTest <- t.test(avg_score ~ group, data = geneff, var.equal = TRUE)
geneffTTest

## 
##  Two Sample t-test
## 
## data:  avg_score by group
## t = -0.9375, df = 1489, p-value = 0.3487
## alternative hypothesis: true difference in means between group Control and group Generation is not equal to 0
## 95 percent confidence interval:
##  -389.2713  137.5056
## sample estimates:
##    mean in group Control mean in group Generation 
##                 27.21218                153.09505

normcheck <- check_normality(geneff$avg_score)
normcheck

## Warning: Non-normality of raw detected (p < .001).

#check_outliers(geneff$avg_score)

check_collinearity(geneff$avg_score)

## Could not extract the variance-covariance matrix for the conditional
##   component of the model. Please try to run `vcov(model)`, which may help
##   identifying the problem.

## NULL

# Run a one-way ANOVA predicting avg_score from group

anovaTest <- aov(avg_score ~ group, data = geneff)
summary(anovaTest)

##               Df    Sum Sq Mean Sq F value Pr(>F)
## group          1 5.906e+06 5905610   0.879  0.349
## Residuals   1489 1.000e+10 6719274

glimpse(geneff)

## Rows: 1,491
## Columns: 19
## $ X          <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ timestamp  <chr> "2019/01/10 11:13:00 AM EST", "2019/01/10 11:32:59 AM EST",…
## $ likeable   <int> 7, 4, 3, 7, 7, 5, 7, 6, 8, 3, 7, 6, 7, 3, 3, 4, 7, 6, 7, 7,…
## $ surprising <int> 3, 5, 3, 3, 7, 4, 5, 3, 6, 6, 7, 7, 9, 3, 6, 6, 6, 3, 2, 7,…
## $ q1         <dbl> -88, -30, -85, -88, -86, -50, -80, -50, -88, 88, -88, -88, …
## $ q2         <int> 15, 30, 20, 30, 32, 20, -20, 300, 25, 25, 302, 300, 25, 10,…
## $ q3         <dbl> 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1,…
## $ q4         <dbl> 15, 14, 35, 15, 26, 40, 20, 15, 15, 15, 15, 15, 16, 15, 15,…
## $ q5         <dbl> 33, 10, 36, 30, 32, 30, 60, 15, 32, 32, 20, 33, 34, 30, 35,…
## $ q6         <int> -80, -50, -86, -88, -36, -66, -84, 85, 94, 88, -17, -90, 84…
## $ q7         <int> -33, -5, 100, -33, 16, -30, -40, -30, -33, 55, -20, -33, -3…
## $ q8         <dbl> 58, 51, 50, 56, 56, 60, 80, 56, 56, 53, 50, 55, 52, 56, 56,…
## $ q9         <dbl> -50, -30, -30, -51, -17, -40, -30, -15, -52, -50, -50, -50,…
## $ q10        <int> 33, 10, 300, 300, 40, 20, -60, 300, 302, 302, 302, 300, 302…
## $ age        <int> 27, 18, 21, 45, 17, 25, 38, 32, 17, 23, 19, 25, 17, 41, 20,…
## $ gender     <fct> m, m, m, m, m, m, none, f, nb, none, m, m, m, m, none, m, f…
## $ edu        <fct> 4yr, high, high, 4yr, NA, 4yr, high, phd, grade, 4yr, high,…
## $ group      <fct> Control, Control, Control, Control, Control, Control, Contr…
## $ avg_score  <dbl> -9.6, 0.1, 34.3, 17.2, 6.4, -1.5, -15.3, 67.7, 35.2, 60.9, …

# Conduct a non-parametric t.test to compare the groups by avg_score

geneffWilcox <- wilcox.test(avg_score ~ group, data = geneff)
geneffWilcox

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  avg_score by group
## W = 307582, p-value = 0.0003444
## alternative hypothesis: true location shift is not equal to 0