Human-Generated Content Achieves More Divergence than LLM-Generated Content

An Empirical Comparison of Human and ChatGPT Writing

Author

Kibum Moon, Elizabeth Kronthal, Adam Green, Kostadin Kushlev

Published

April 7, 2024

Code
#------------------------------------------------------------------------------------------
# Copyright © 2024 Kibum Moon. 
# This work is licensed under CC-by Attribution 4.0 International License. 
#------------------------------------------------------------------------------------------

# loading packages
library(tidyverse)
library(broom)
library(afex)
library(emmeans)
library(sjPlot)
library(scales)

library(ggpubr)
library(papaja)
library(plotly)
library(ggeasy)

library(janitor)
library(here)

# Default look for every plot in this document: APA theme, x-axis labels
# rotated 45 degrees (right-justified), legend placed above the panel.
theme_set(
  theme_apa() +
    easy_rotate_x_labels(45, "right") +
    theme(legend.position = "top")
)

# rstudioapi::getActiveDocumentContext()$path %>%
#   dirname() %>%
#   setwd()

# Load the three Study 1 DSI files and tag every row with its authorship group.
s1_dsi_r <- mutate(
  read_csv("./spsp_s1_dsi/spsp_s1_random_20240122.csv"),
  authorship = "Regular Human"
)

s1_dsi_d <- mutate(
  read_csv("./spsp_s1_dsi/spsp_s1_diverse_20240122.csv"),
  authorship = "Diverse Human"
)

# The GPT file carries an extra `...1` column (presumably an unnamed index
# column from the export -- confirm); drop it before labelling.
s1_dsi_g <- mutate(
  select(read_csv("./spsp_s1_dsi/spsp_s1_gpt_20240123.csv"), -`...1`),
  authorship = "GPT4"
)

# Stack only the two shared columns from each source, then rescale DSI by 1000.
s1_dsi <- list(s1_dsi_r, s1_dsi_d, s1_dsi_g) %>%
  map(~ select(.x, authorship, dsi)) %>%
  bind_rows() %>%
  mutate(dsi = dsi * 1000)

Research Questions

  • RQ1: Are human written essays more semantically diverse than GPT-4-generated essays?

  • RQ2: Can diversifying the race/ethnicity of essay authors yield higher collective creativity?

Note

  • The DSI calculation was conducted using Python scripts and is not included here. Please contact me at km1735@georgetown.edu if you need assistance.

Method

Sample Demographics

General Sample

  • Randomly select 200 essays from an archive of ~50k essays.
  • Three applicants provided NA responses for all race/ethnicity questions.
Code
# Race/ethnicity breakdown of the general sample: one row per distinct
# combination of "Y" flags, with its count and share of the counted essays.
s1_dsi_r %>%
  select(ref, contains("race"), -race_all) %>%
  mutate(
    # Flag "other" when none of the race_* columns is "Y". A row containing
    # an NA flag yields an NA row sum, so race_other is NA for that row.
    race_other = ifelse(
      rowSums(across(starts_with("race"), ~ .x == "Y")) == 0,
      "Y",
      "N"
    )
  ) %>%
  pivot_longer(cols = -ref) %>%
  # Keep only the selected categories; NA and "N" values are dropped here.
  filter(value == "Y") %>%
  group_by(ref) %>%
  summarise(race = paste(name, collapse = ", ")) %>%
  count(race, sort = TRUE) %>%
  mutate(ratio = percent(n / sum(n), accuracy = 0.1)) %>%
  tab_df()
race n ratio
race_white 76 38.6%
race_asian 43 21.8%
race_other 33 16.8%
race_hispanic, race_white 16 8.1%
race_african_american_or_black 11 5.6%
race_asian, race_white 9 4.6%
race_african_american_or_black, race_hispanic 3 1.5%
race_african_american_or_black, race_white 2 1.0%
race_african_american_or_black, race_american_indian_or_alaskan_native, race_hispanic 1 0.5%
race_african_american_or_black, race_asian 1 0.5%
race_asian, race_hispanic 1 0.5%
race_hispanic 1 0.5%

Diverse Sample

  • Exclude applicants who denoted more than one ethnic identity

  • Randomly select applicants from the following six race/ethnic groups, ensuring an even distribution across each group

    • race_african_american_or_black

    • race_american_indian_or_alaskan_native

    • race_asian

    • race_hispanic

    • race_native_hawaiian_or_pacific_islander

    • race_white

  • After selecting 34 applicants from each of the six groups (n = 204), randomly select 200 applicants out of those 204.

Code
# Analyze sample demographics for the diverse sample: one row per distinct
# combination of "Y" race flags, with its count and share.
s1_dsi_d %>%
  select(ref, contains("race")) %>%
  select(-race_all) %>%
  # Flag "other" when no race_* column is "Y". Written as `== 0` (same form as
  # the general-sample chunk) instead of `!rowSums(...) > 0`, which only works
  # because `!` binds more loosely than `>`.
  mutate(race_other = ifelse(rowSums(across(starts_with("race"), ~ .x == "Y")) == 0, "Y", "N")) %>%
  pivot_longer(-ref) %>%
  filter(value == "Y") %>%
  group_by(ref) %>%
  summarise(race = paste0(name, collapse = ", ")) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  count(race, sort = TRUE) %>%
  mutate(ratio = percent((n/sum(n)), accuracy = 0.1)) %>%
  tab_df()
race n ratio
race_african_american_or_black 34 17.0%
race_asian 34 17.0%
race_native_hawaiian_or_pacific_islander 34 17.0%
race_white 34 17.0%
race_american_indian_or_alaskan_native 33 16.5%
race_hispanic 31 15.5%

Results

At Individual Level

Code
# s1_dsi %>% 
#   mutate(authorship = fct_relevel(authorship, "Diverse Human", "Regular Human" , "GPT4")) %>% 
#   ggplot(aes(dsi, authorship, fill = authorship)) +
#   ggridges::geom_density_ridges() +
#   ggridges::theme_ridges() +
#   theme(legend.position = "none") +
#   labs(fill = "Authorship",
#        x = "DSI",
#        y = "Density") +
#   scale_fill_brewer(palette = "Set1", direction = 1)


# Overlaid density curves of essay-level DSI, one per authorship group.
# The authorship level order is reversed before plotting.
p_s1_dsi_dist <- ggplot(
  mutate(s1_dsi, authorship = fct_rev(authorship)),
  aes(x = dsi, fill = authorship)
) +
  geom_density(alpha = 0.8) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  labs(
    x = "DSI",
    y = "Density",
    fill = "Authorship"
  )

p_s1_dsi_dist

Code
# One-way linear model of DSI on authorship group.
anova_s1 <- s1_dsi %>% lm(dsi ~ authorship, data = .)

# Estimated marginal means per authorship group.
emmeans_results <- emmeans(anova_s1, specs = ~authorship)

# Holm-adjusted pairwise contrasts, stored as a tidy data frame.
s1_pairs <- tidy(pairs(emmeans_results, adjust = "holm"))

# Standardized pairwise effect sizes, scaled by the model's residual SD.
eff_size(
  emmeans_results,
  sigma = sigma(anova_s1),
  edf = df.residual(anova_s1)
)
 contrast                      effect.size    SE  df lower.CL upper.CL
 Diverse Human - GPT4                0.305 0.100 597    0.108   0.5026
 Diverse Human - Regular Human      -0.130 0.100 597   -0.327   0.0663
 GPT4 - Regular Human               -0.436 0.101 597   -0.634  -0.2377

sigma used for effect sizes: 7.023 
Confidence level used: 0.95 
Code
# Mean (m) and standard deviation (sd) of DSI within each authorship group.
s1_dsi %>%
  group_by(authorship) %>%
  summarise(across(dsi, list(m = mean, sd = sd), .names = "{.fn}")) %>%
  tab_df(digits = 3)
authorship m sd
Diverse Human 829.004 8.143
GPT4 826.859 3.884
Regular Human 829.919 8.159
Code
# Group means with normal-theory confidence limits (point range plus error
# bars) and an ANOVA annotation placed at y = 833.
s1_p1 <- s1_dsi %>%
  mutate(
    authorship = fct_relevel(
      authorship,
      "GPT4", "Regular Human", "Diverse Human"
    )
  ) %>%
  ggplot(aes(x = authorship, y = dsi, color = authorship)) +
  stat_summary(fun.data = mean_cl_normal) +
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width = 0.05) +
  ggpubr::stat_anova_test(label.y = 833) +
  scale_color_brewer(palette = "Set1", direction = -1)

s1_p1

Code
# ggsave("./plots/figure1.png", s1_p1, width = 6, height = 6)
# ggsave("./plots/spsp_dsi_dist.pdf", p_s1_dsi_dist, width = 10, height = 7)

At Aggregated Level

  • Bootstrapping sample size: n = 5,000
Code
# Read the bootstrapped (aggregated) DSI files for the 5,000-draw runs and
# stack them into one data frame labelled by authorship group.
#
# `tibble()` on the piped, unnamed path vector produces a single column named
# `.`; inside the dplyr verbs below a bare `.` therefore resolves to that
# column of file paths (NOTE(review): relies on data-mask lookup -- confirm
# if this chain is ever refactored).
s1_dsi_agg_raw <- list.files("./spsp_s1_dsi_boot", full.names = TRUE) %>%
  tibble() %>%
  # Text between "dsi_boot/" and "_nboot<d>000.csv"; the dot is escaped so it
  # matches a literal "." rather than any character.
  mutate(cate = str_extract(., "(?<=dsi_boot/).*(?=_nboot[0-9]000\\.csv)")) %>%
  filter(str_detect(., "nboot5000")) %>%
  # Everything before the last underscore of `cate` is the authorship key.
  mutate(authorship = str_extract(cate, ".*(?=_)")) %>%
  filter(!is.na(authorship)) %>%
  filter(authorship != "spsp_s1_white") %>%
  # One CSV per row, read into a list-column and then expanded.
  mutate(df = map(., ~ read_csv(.))) %>%
  unnest(df) %>%
  filter(n_essay <= 34) %>%
  mutate(authorship = case_match(authorship,
                                 "spsp_s1_diverse" ~ "Diverse Human",
                                 "spsp_s1_random" ~ "Regular Human",
                                 "spsp_s1_gpt" ~ "GPT4")) %>%
  # Same x1000 rescaling that is applied to the essay-level DSI.
  mutate(dsi = dsi * 1000)



# Combine the bootstrapped aggregates with the essay-level scores; the latter
# are treated as the n_essay = 1 case.
s1_dsi_agg <- bind_rows(
  select(s1_dsi_agg_raw, authorship, n_essay, dsi),
  transmute(s1_dsi, authorship, dsi, n_essay = 1)
)


# Interactive plot of the summarised DSI at each aggregation size, one
# point-and-line series per authorship group.
p <- ggplot(
  s1_dsi_agg,
  aes(
    x = factor(n_essay),
    y = dsi,
    color = authorship,
    group = authorship
  )
) +
  stat_summary(geom = "point") +
  stat_summary(geom = "line") +
  labs(
    x = "The number of essays in each bootstrapping",
    y = "DSI"
  )

ggplotly(p)

Curvilinear analysis

Code
# Mean DSI per authorship group and aggregation size, expressed as the gain
# over that group's single-essay (n_essay == 1) mean.
s1_delta_dsi <- s1_dsi_agg %>%
  arrange(authorship, n_essay) %>%
  select(dsi, n_essay, authorship) %>%
  group_by(authorship, n_essay) %>%
  summarise(dsi = mean(dsi)) %>%
  # Regroup by authorship only so the baseline is looked up within each group.
  group_by(authorship) %>%
  mutate(
    delta_dsi = dsi - dsi[n_essay == 1],
    log_n_essay = log(n_essay)
  ) %>%
  ungroup() %>%
  # Drop the baseline rows themselves.
  filter(n_essay >= 2) %>%
  mutate(
    authorship = fct_relevel(
      authorship,
      "Diverse Human", "Regular Human", "GPT4"
    )
  )


# Gain in DSI over the single-essay baseline as a function of the number of
# essays, with a fitted y ~ log(x) line and 95% band per authorship group.
s1_p2 <- s1_delta_dsi %>%
  mutate(authorship = fct_rev(authorship)) %>% 
  ggplot(aes(n_essay, delta_dsi, color = authorship)) +
  geom_point(alpha = 0.3) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  stat_smooth(method = "lm",
              formula = 'y ~ log(x)',
              se = TRUE, level = 0.95) +
  labs(x = "Number of Essays",
       y = expression(
         atop(
           "Collective Creativity"
         )
       )) +
  scale_x_continuous(breaks = 2:34) +
  # NOTE(review): theme_minimal() below is a complete theme, so the legend
  # position and x-label rotation set here are presumably overridden, and
  # easy_remove_legend() then hides the legend altogether -- confirm whether
  # these three calls are still wanted.
  ggeasy::easy_add_legend_title("Authorship") +
  ggeasy::easy_move_legend("top") +
  ggeasy::easy_rotate_x_labels(0, "right") +
  theme_minimal() + 
  easy_remove_legend() +
  scale_color_brewer(palette = "Set1", direction = -1)

s1_p2

Code
# ggsave("./plots/SfNC_figure2.png", s1_p2, width = 9, height = 5)


# Move "Regular Human" to the first factor level so it serves as the
# reference category in the models below.
s1_delta_dsi <- mutate(
  s1_delta_dsi,
  authorship = fct_relevel(authorship, "Regular Human")
)

# Nested linear models of the DSI gain:
#   m1: authorship only
#   m2: log(n_essay) only
#   m3: both main effects
#   m4: main effects plus their interaction
s1_m1 <- s1_delta_dsi %>% lm(delta_dsi ~ authorship, data = .)
s1_m2 <- s1_delta_dsi %>% lm(delta_dsi ~ log(n_essay), data = .)
s1_m3 <- s1_delta_dsi %>% lm(delta_dsi ~ authorship + log(n_essay), data = .)
s1_m4 <- s1_delta_dsi %>% lm(delta_dsi ~ authorship * log(n_essay), data = .)

# 95% confidence intervals for the main-effects model, to three decimals.
round(confint(s1_m3, level = 0.95), digits = 3)
                         2.5 % 97.5 %
(Intercept)              3.097  3.572
authorshipDiverse Human  0.803  1.089
authorshipGPT4          -4.081 -3.795
log(n_essay)             0.387  0.547
Code
# 95% confidence intervals for the interaction model, to three decimals.
round(confint(s1_m4, level = 0.95), digits = 3)
                                      2.5 % 97.5 %
(Intercept)                           2.757  3.240
authorshipDiverse Human               0.133  0.815
authorshipGPT4                       -2.799 -2.117
log(n_essay)                          0.505  0.679
authorshipDiverse Human:log(n_essay)  0.053  0.298
authorshipGPT4:log(n_essay)          -0.674 -0.429
Code
# F-test comparing the main-effects model against the interaction model.
anova(s1_m3, s1_m4)
Analysis of Variance Table

Model 1: delta_dsi ~ authorship + log(n_essay)
Model 2: delta_dsi ~ authorship * log(n_essay)
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1     95 8.1206                                  
2     93 3.0969  2    5.0237 75.431 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
# Side-by-side coefficient table for the main-effects and interaction models
# (no CIs, significance stars). `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
tab_model(
  s1_m3, s1_m4,
  digits = 3,
  show.ci = FALSE,
  p.style = "stars",
  dv.labels = c("Agg. M1", "Agg. M2")
)
  Agg. M1 Agg. M2
Predictors Estimates Estimates
(Intercept) 3.334 *** 2.998 ***
authorship [Diverse
Human]
0.946 *** 0.474 **
authorship [GPT4] -3.938 *** -2.458 ***
n essay [log] 0.467 *** 0.592 ***
authorship [Diverse
Human] × n essay [log]
0.176 **
authorship [GPT4] × n
essay [log]
-0.551 ***
Observations 99 99
R2 / R2 adjusted 0.982 / 0.982 0.993 / 0.993
* p<0.05   ** p<0.01   *** p<0.001
Code
# Gain in R-squared from adding the authorship x log(n_essay) interaction.
s1_r2_m3 <- summary(s1_m3)[["r.squared"]]
s1_r2_m4 <- summary(s1_m4)[["r.squared"]]
s1_r2_m4 - s1_r2_m3
[1] 0.01086771
Code
# Log-linear fit of the DSI gain for the "Regular Human" essays only.
filter(s1_delta_dsi, authorship == "Regular Human") %>%
  lm(delta_dsi ~ log(n_essay), data = .) %>%
  tab_model()
  delta dsi
Predictors Estimates CI p
(Intercept) 3.00 2.72 – 3.28 <0.001
n essay [log] 0.59 0.49 – 0.69 <0.001
Observations 33
R2 / R2 adjusted 0.824 / 0.818
Code
# Log-linear fit of the DSI gain for the "Diverse Human" essays only.
filter(s1_delta_dsi, authorship == "Diverse Human") %>%
  lm(delta_dsi ~ log(n_essay), data = .) %>%
  tab_model()
  delta dsi
Predictors Estimates CI p
(Intercept) 3.47 3.15 – 3.79 <0.001
n essay [log] 0.77 0.65 – 0.88 <0.001
Observations 33
R2 / R2 adjusted 0.855 / 0.850
Code
# Log-linear fit of the DSI gain for the "GPT4" essays only.
filter(s1_delta_dsi, authorship == "GPT4") %>%
  lm(delta_dsi ~ log(n_essay), data = .) %>%
  tab_model()
  delta dsi
Predictors Estimates CI p
(Intercept) 0.54 0.49 – 0.59 <0.001
n essay [log] 0.04 0.02 – 0.06 <0.001
Observations 33
R2 / R2 adjusted 0.425 / 0.406