Human-Generated Content Achieves More Divergence than LLM-Generated Content

An Empirical Comparison of Human and ChatGPT Writing

Author

Kibum Moon, Elizabeth Kronthal, Adam Green, Kostadin Kushlev

Published

April 7, 2024

Code

#------------------------------------------------------------------------------------------
# Copyright © 2024 Kibum Moon. 
# This work is licensed under CC-by Attribution 4.0 International License. 
#------------------------------------------------------------------------------------------

# loading packages
library(tidyverse)
library(broom)
library(afex)
library(emmeans)
library(sjPlot)
library(scales)

library(ggpubr)
library(papaja)
library(plotly)
library(ggeasy)

library(janitor)
library(here)

# Global plot theme: APA base theme, x-axis labels rotated 45 degrees
# (right-justified), legend placed above the panel.
theme_set(
  theme_apa() +
    easy_rotate_x_labels(angle = 45, side = "right") +
    theme(legend.position = "top")
)

# rstudioapi::getActiveDocumentContext()$path %>%
#   dirname() %>%
#   setwd()

# Load the per-essay DSI files and tag every row with its authorship condition.
s1_dsi_r <- mutate(
  read_csv("./spsp_s1_dsi/spsp_s1_random_20240122.csv"),
  authorship = "Regular Human"
)

s1_dsi_d <- mutate(
  read_csv("./spsp_s1_dsi/spsp_s1_diverse_20240122.csv"),
  authorship = "Diverse Human"
)

# The GPT file has a column named `...1` (the name readr gives to an unnamed
# first column); it is dropped before labelling.
s1_dsi_g <- read_csv("./spsp_s1_dsi/spsp_s1_gpt_20240123.csv") %>%
  select(-`...1`) %>%
  mutate(authorship = "GPT4")

# Stack the three sources, keeping only the authorship label and the DSI score,
# then multiply DSI by 1000.
s1_dsi <- bind_rows(
  select(s1_dsi_r, authorship, dsi),
  select(s1_dsi_d, authorship, dsi),
  select(s1_dsi_g, authorship, dsi)
)
s1_dsi <- mutate(s1_dsi, dsi = dsi * 1000)

Research Question

RQ1: Are human-written essays more semantically diverse than GPT-4-generated essays?
RQ2: Does diversifying the race/ethnicity of essay authors lead to higher collective creativity?

Note

The DSI calculation was conducted using Python scripts and is not included here. Please contact me at km1735@georgetown.edu if you need assistance.

Method

Sample Demographics

General Sample

Randomly selected 200 essays from an archive of ~50k essays.
Three applicants provided NA responses to all race/ethnicity questions.

Code

# Tabulate the race/ethnicity combinations reported in the general sample.
s1_dsi_r %>%
  select(ref, contains("race"), -race_all) %>%
  # race_other = "Y" when none of the race columns equals "Y".
  # rowSums() is called without na.rm, so a row containing NA flags yields NA
  # here rather than "Y"/"N".
  mutate(
    race_other = ifelse(
      rowSums(across(starts_with("race"), function(flag) flag == "Y")) == 0,
      "Y",
      "N"
    )
  ) %>%
  # One row per (essay, race column); keep only the endorsed ones.
  pivot_longer(cols = -ref) %>%
  filter(value == "Y") %>%
  # Collapse each essay's endorsed columns into a single label.
  group_by(ref) %>%
  summarise(race = paste(name, collapse = ", ")) %>%
  # Frequency and share of every label, most frequent first.
  count(race, sort = TRUE) %>%
  mutate(ratio = percent(n / sum(n), accuracy = 0.1)) %>%
  tab_df()

race	n	ratio
race_white	76	38.6%
race_asian	43	21.8%
race_other	33	16.8%
race_hispanic, race_white	16	8.1%
race_african_american_or_black	11	5.6%
race_asian, race_white	9	4.6%
race_african_american_or_black, race_hispanic	3	1.5%
race_african_american_or_black, race_white	2	1.0%
race_african_american_or_black, race_american_indian_or_alaskan_native, race_hispanic	1	0.5%
race_african_american_or_black, race_asian	1	0.5%
race_asian, race_hispanic	1	0.5%
race_hispanic	1	0.5%

Diverse Sample

Exclude applicants who indicated more than one racial/ethnic identity
Randomly select applicants from the following six race/ethnic groups, ensuring an even distribution across each group
- race_african_american_or_black
- race_american_indian_or_alaskan_native
- race_asian
- race_hispanic
- race_native_hawaiian_or_pacific_islander
- race_white
After selecting 34 applicants from each of the six groups (n = 204), randomly select 200 applicants out of those 204.

Code

# Analyze sample demographics for the diverse sample.
# Same pipeline as for the general sample: flag essays with no endorsed race
# column as "other", reshape to one row per endorsed column, collapse to one
# label per essay, then count labels.
s1_dsi_d %>%
  select(ref, contains("race")) %>%
  select(-race_all) %>%
  # The original condition was `!rowSums(...) > 0`, which R parses as
  # `!(rowSums(...) > 0)` because unary `!` binds more loosely than `>`.
  # Written as `== 0` so the intent does not depend on precedence and matches
  # the general-sample block.
  mutate(
    race_other = ifelse(
      rowSums(across(starts_with("race"), ~ .x == "Y")) == 0,
      "Y",
      "N"
    )
  ) %>%
  pivot_longer(-ref) %>%
  filter(value == "Y") %>%
  group_by(ref) %>%
  summarise(race = paste0(name, collapse = ", ")) %>%
  # TRUE rather than T: `T` is an ordinary variable that can be reassigned.
  count(race, sort = TRUE) %>%
  mutate(ratio = percent((n / sum(n)), accuracy = 0.1)) %>%
  tab_df()

race	n	ratio
race_african_american_or_black	34	17.0%
race_asian	34	17.0%
race_native_hawaiian_or_pacific_islander	34	17.0%
race_white	34	17.0%
race_american_indian_or_alaskan_native	33	16.5%
race_hispanic	31	15.5%

Results

At Individual Level

Code

# s1_dsi %>% 
#   mutate(authorship = fct_relevel(authorship, "Diverse Human", "Regular Human" , "GPT4")) %>% 
#   ggplot(aes(dsi, authorship, fill = authorship)) +
#   ggridges::geom_density_ridges() +
#   ggridges::theme_ridges() +
#   theme(legend.position = "none") +
#   labs(fill = "Authorship",
#        x = "DSI",
#        y = "Density") +
#   scale_fill_brewer(palette = "Set1", direction = 1)


# Overlaid density curves of per-essay DSI, one curve per authorship group.
# The authorship levels are reversed before plotting and the Set1 palette is
# applied in reverse direction.
p_s1_dsi_dist <- ggplot(
  mutate(s1_dsi, authorship = fct_rev(authorship)),
  aes(x = dsi, fill = authorship)
) +
  geom_density(alpha = 0.8) +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  labs(
    x = "DSI",
    y = "Density",
    fill = "Authorship"
  )

p_s1_dsi_dist

Code

# One-way comparison of DSI across authorship groups, fitted as a linear model.
# The data frame is passed by name instead of being piped in as `.`: the call
# stored in the model then reads `data = s1_dsi` and can be re-evaluated later
# (e.g. by update()), which `data = .` cannot.
anova_s1 <- lm(dsi ~ authorship, data = s1_dsi)

# Estimated marginal mean of DSI for each authorship group.
emmeans_results <- emmeans(anova_s1, ~authorship)

# All pairwise contrasts with Holm-adjusted p-values, as a tidy data frame.
s1_pairs <- pairs(emmeans_results, adjust = "holm") %>%
  tidy()

# Standardised pairwise effect sizes, scaled by the model's residual SD and
# using its residual degrees of freedom.
eff_size(
  emmeans_results,
  sigma = sigma(anova_s1),
  edf = df.residual(anova_s1)
)

 contrast                      effect.size    SE  df lower.CL upper.CL
 Diverse Human - GPT4                0.305 0.100 597    0.108   0.5026
 Diverse Human - Regular Human      -0.130 0.100 597   -0.327   0.0663
 GPT4 - Regular Human               -0.436 0.101 597   -0.634  -0.2377

sigma used for effect sizes: 7.023 
Confidence level used: 0.95

Code

# Mean and standard deviation of DSI within each authorship group,
# rendered as a table with three decimals.
s1_dsi %>%
  group_by(authorship) %>%
  summarise(
    m = mean(dsi),
    sd = sd(dsi)
  ) %>%
  tab_df(digits = 3)

authorship	m	sd
Diverse Human	829.004	8.143
GPT4	826.859	3.884
Regular Human	829.919	8.159

Code

# Group means of DSI with mean_cl_normal intervals (point + range, plus capped
# error bars) and the ANOVA test label from ggpubr placed at y = 833.
# Groups are ordered GPT4, Regular Human, Diverse Human on the x axis.
s1_p1 <- s1_dsi %>%
  mutate(
    authorship = fct_relevel(
      authorship,
      "GPT4", "Regular Human", "Diverse Human"
    )
  ) %>%
  ggplot(aes(x = authorship, y = dsi, color = authorship)) +
  stat_summary(fun.data = mean_cl_normal) +
  stat_summary(
    fun.data = mean_cl_normal,
    geom = "errorbar",
    width = 0.05
  ) +
  ggpubr::stat_anova_test(label.y = 833) +
  scale_color_brewer(palette = "Set1", direction = -1)

s1_p1

Code

# ggsave("./plots/figure1.png", s1_p1, width = 6, height = 6)
# ggsave("./plots/spsp_dsi_dist.pdf", p_s1_dsi_dist, width = 10, height = 7)

At Aggregated Level

Number of bootstrap resamples: n = 5,000

Code

# Read the bootstrapped DSI files (5000-bootstrap runs only) into one data frame.
#
# Piping the character vector of paths into tibble() creates a single column
# that is literally named `.`. Inside the dplyr verbs below, `.` therefore
# refers to that column of file paths, not to the whole data frame.
s1_dsi_agg_raw <- list.files("./spsp_s1_dsi_boot", full.names = TRUE) %>%
  tibble() %>%
  # cate: the part of the path between "dsi_boot/" and the "_nboot?000.csv" suffix.
  mutate(cate = str_extract(., "(?<=dsi_boot/).*(?=_nboot[0-9]000.csv)")) %>%
  filter(str_detect(., "nboot5000")) %>%
  # authorship: everything in cate before its last underscore.
  mutate(authorship = str_extract(cate, ".*(?=_)")) %>%
  filter(!is.na(authorship)) %>%
  # The "spsp_s1_white" files are left out of this analysis.
  filter(authorship != "spsp_s1_white") %>%
  # Read every remaining file and expand its rows in place.
  mutate(df = map(., read_csv)) %>%
  unnest(df) %>%
  filter(n_essay <= 34) %>%
  # Same display labels and x1000 DSI scaling as the per-essay data.
  mutate(
    authorship = case_match(
      authorship,
      "spsp_s1_diverse" ~ "Diverse Human",
      "spsp_s1_random" ~ "Regular Human",
      "spsp_s1_gpt" ~ "GPT4"
    ),
    dsi = dsi * 1000
  )



# Combine the bootstrapped pools with the per-essay scores; the per-essay rows
# enter as pools of size n_essay = 1.
s1_dsi_agg <- bind_rows(
  select(s1_dsi_agg_raw, authorship, n_essay, dsi),
  mutate(select(s1_dsi, authorship, dsi), n_essay = 1)
)


# DSI by pool size for each authorship group: stat_summary() is used with its
# default summary function, drawn once as points and once as a connecting line.
# The static plot is then converted to an interactive plotly figure.
p <- ggplot(
  s1_dsi_agg,
  aes(
    x = factor(n_essay),
    y = dsi,
    color = authorship,
    group = authorship
  )
) +
  stat_summary(geom = "point") +
  stat_summary(geom = "line") +
  labs(
    x = "The number of essays in each bootstrapping",
    y = "DSI"
  )

ggplotly(p)

Curvilinear analysis

Code

# Gain in mean DSI relative to single essays, per authorship group and pool size.
s1_delta_dsi <- s1_dsi_agg %>%
  select(authorship, n_essay, dsi) %>%
  arrange(authorship, n_essay) %>%
  # Mean DSI for every (authorship, n_essay) cell.
  group_by(authorship, n_essay) %>%
  summarise(dsi = mean(dsi), .groups = "drop") %>%
  # Within each authorship group, subtract the n_essay == 1 mean.
  group_by(authorship) %>%
  mutate(
    delta_dsi = dsi - dsi[n_essay == 1],
    log_n_essay = log(n_essay)
  ) %>%
  ungroup() %>%
  # The n_essay == 1 baseline rows are not kept.
  filter(n_essay >= 2) %>%
  mutate(
    authorship = fct_relevel(
      authorship,
      "Diverse Human", "Regular Human", "GPT4"
    )
  )


# DSI gain against pool size, with a per-group linear fit on log(x) and its
# 95% confidence band.
s1_p2 <- ggplot(
  mutate(s1_delta_dsi, authorship = fct_rev(authorship)),
  aes(x = n_essay, y = delta_dsi, color = authorship)
) +
  geom_point(alpha = 0.3) +
  stat_smooth(
    method = "lm",
    formula = 'y ~ log(x)',
    se = TRUE,
    level = 0.95
  ) +
  scale_x_continuous(breaks = 2:34) +
  scale_color_brewer(palette = "Set1", direction = -1) +
  labs(
    x = "Number of Essays",
    y = expression(atop("Collective Creativity"))
  ) +
  easy_add_legend_title("Authorship") +
  # NOTE(review): the order of the theme calls below is kept exactly as in the
  # original. theme_minimal() is a complete theme added after the legend move
  # and label rotation, so it appears to override both, and the legend is then
  # removed entirely -- confirm whether those two tweaks are still wanted.
  easy_move_legend("top") +
  easy_rotate_x_labels(0, "right") +
  theme_minimal() +
  easy_remove_legend()

s1_p2

Code

# ggsave("./plots/SfNC_figure2.png", s1_p2, width = 9, height = 5)


# Move "Regular Human" to the first factor level so that it is the reference
# category for the authorship coefficients below.
s1_delta_dsi <- s1_delta_dsi %>%
  mutate(authorship = fct_relevel(authorship, "Regular Human"))

# Candidate models for the DSI gain. The data frame is passed by name rather
# than piped in as `.`, so each stored model call reads `data = s1_delta_dsi`
# and can be re-evaluated later (e.g. by update()), which `data = .` cannot.
#   m1: authorship only
#   m2: log(pool size) only
#   m3: additive
#   m4: authorship x log(pool size) interaction
s1_m1 <- lm(delta_dsi ~ authorship, data = s1_delta_dsi)
s1_m2 <- lm(delta_dsi ~ log(n_essay), data = s1_delta_dsi)
s1_m3 <- lm(delta_dsi ~ authorship + log(n_essay), data = s1_delta_dsi)
s1_m4 <- lm(delta_dsi ~ authorship * log(n_essay), data = s1_delta_dsi)

# 95% confidence intervals for the additive model, rounded to three decimals.
s1_m3 %>%
  confint(level = 0.95) %>%
  round(digits = 3)

                         2.5 % 97.5 %
(Intercept)              3.097  3.572
authorshipDiverse Human  0.803  1.089
authorshipGPT4          -4.081 -3.795
log(n_essay)             0.387  0.547

Code

# 95% confidence intervals for the interaction model, rounded to three decimals.
round(confint(s1_m4, level = 0.95), digits = 3)

                                      2.5 % 97.5 %
(Intercept)                           2.757  3.240
authorshipDiverse Human               0.133  0.815
authorshipGPT4                       -2.799 -2.117
log(n_essay)                          0.505  0.679
authorshipDiverse Human:log(n_essay)  0.053  0.298
authorshipGPT4:log(n_essay)          -0.674 -0.429

Code

# F-test comparing the additive model against the interaction model.
anova(s1_m3, s1_m4)

Analysis of Variance Table

Model 1: delta_dsi ~ authorship + log(n_essay)
Model 2: delta_dsi ~ authorship * log(n_essay)
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1     95 8.1206                                  
2     93 3.0969  2    5.0237 75.431 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Code

# Side-by-side coefficient table for the additive and interaction models:
# three decimals, significance stars instead of p-values, no CI columns.
tab_model(
  s1_m3,
  s1_m4,
  digits = 3,
  show.ci = FALSE,
  p.style = "stars",
  dv.labels = c("Agg. M1", "Agg. M2")
)

	Agg. M1	Agg. M2
Predictors	Estimates	Estimates
(Intercept)	3.334 ^***	2.998 ^***
authorship [Diverse Human]	0.946 ^***	0.474 ^**
authorship [GPT4]	-3.938 ^***	-2.458 ^***
n essay [log]	0.467 ^***	0.592 ^***
authorship [Diverse Human] × n essay [log]		0.176 ^**
authorship [GPT4] × n essay [log]		-0.551 ^***
Observations	99	99
R² / R² adjusted	0.982 / 0.982	0.993 / 0.993
* p<0.05   ** p<0.01   *** p<0.001

Code

# Increase in R-squared from adding the authorship x log(n_essay) interaction.
s1_r2_m3 <- summary(s1_m3)[["r.squared"]]
s1_r2_m4 <- summary(s1_m4)[["r.squared"]]

s1_r2_m4 - s1_r2_m3

[1] 0.01086771

Code

# DSI gain regressed on log(pool size), Regular Human rows only.
tab_model(
  lm(
    delta_dsi ~ log(n_essay),
    data = filter(s1_delta_dsi, authorship == "Regular Human")
  )
)

	delta dsi
Predictors	Estimates	CI	p
(Intercept)	3.00	2.72 – 3.28	<0.001
n essay [log]	0.59	0.49 – 0.69	<0.001
Observations	33
R² / R² adjusted	0.824 / 0.818

Code

# DSI gain regressed on log(pool size), Diverse Human rows only.
tab_model(
  lm(
    delta_dsi ~ log(n_essay),
    data = filter(s1_delta_dsi, authorship == "Diverse Human")
  )
)

	delta dsi
Predictors	Estimates	CI	p
(Intercept)	3.47	3.15 – 3.79	<0.001
n essay [log]	0.77	0.65 – 0.88	<0.001
Observations	33
R² / R² adjusted	0.855 / 0.850

Code

# DSI gain regressed on log(pool size), GPT4 rows only.
tab_model(
  lm(
    delta_dsi ~ log(n_essay),
    data = filter(s1_delta_dsi, authorship == "GPT4")
  )
)

	delta dsi
Predictors	Estimates	CI	p
(Intercept)	0.54	0.49 – 0.59	<0.001
n essay [log]	0.04	0.02 – 0.06	<0.001
Observations	33
R² / R² adjusted	0.425 / 0.406