# Purpose
#
# This R script uses the GDELT database to compare the daily volume of “Donald Trump” and “Joe Biden” coverage in U.S.-based English-language news stories published online between two user-customizable dates.
#
# For both Trump and Biden, daily volume is expressed as the percentage of all monitored news articles that include that person’s full name. Note that the volume counts also include any articles that mention other people named “Donald Trump” or “Joe Biden,” including “Donald Trump Jr.,” a son of former President Donald Trump.

# Installing (if missing) and loading the required packages
if (!require("tidyverse"))
  install.packages("tidyverse")
if (!require("dplyr"))
  install.packages("dplyr")
if (!require("plotly"))
  install.packages("plotly")
if (!require("readr"))
  install.packages("readr")
library(tidyverse)
library(dplyr)
library(plotly)
library(ggplot2)
library(readr)

### Date range
# Both dates are written as YYYYMMDD; the request appends "000000" to each
# one to form the API's start/end datetime values.
startdate <- "20220101" #Customizable
enddate <- "20230829" #Customizable

# Download one GDELT coverage-volume timeline.
#
# Builds the DOC API request (mode "timelinevolinfo", CSV output) for `query`
# between `start` and `end`, prints the encoded URL so it can be inspected,
# reads the CSV, and returns the table with `Date` converted to class Date.
#
# Arguments:
#   query  GDELT search string, e.g. "'Joe Biden' SourceCountry:US"
#   start  first date, as an 8-digit YYYYMMDD string
#   end    last date, as an 8-digit YYYYMMDD string
#
# Stops with an error when a date is not 8 digits or when the response does
# not contain the `Date` and `Value` columns the rest of the script relies on.
fetch_gdelt_volume <- function(query, start, end) {
  for (date_arg in list(start, end)) {
    if (length(date_arg) != 1 || !grepl("^[0-9]{8}$", date_arg)) {
      stop(
        "Dates must be single 8-digit YYYYMMDD values, e.g. \"20220101\".",
        call. = FALSE
      )
    }
  }

  text_url <- paste0(
    "https://api.gdeltproject.org/api/v2/doc/doc?query=", query,
    "&mode=timelinevolinfo&startdatetime=", start,
    "000000&enddatetime=", end,
    "000000&format=CSV"
  )
  # Percent-encode the spaces in the query so the URL is valid
  encoded_url <- URLencode(text_url)
  print(encoded_url)

  volume <- read_csv(encoded_url)

  # Fail here with a clear message rather than later in the merge or plot
  missing_cols <- setdiff(c("Date", "Value"), names(volume))
  if (length(missing_cols) > 0) {
    stop(
      "GDELT response is missing expected column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  volume$Date <- as.Date(volume$Date, "%Y-%m-%d")
  volume
}

### Trump
VolumeTrump <- fetch_gdelt_volume(
  "'Donald Trump' SourceCountry:US",
  startdate,
  enddate
)

### Biden
VolumeBiden <- fetch_gdelt_volume(
  "'Joe Biden' SourceCountry:US",
  startdate,
  enddate
)

### Merging
# Join the two timelines on Date. Columns present in both inputs come back
# suffixed .x (Trump, the first input) and .y (Biden, the second input).
VolumeTrumpBiden <- merge(x = VolumeTrump, y = VolumeBiden, by = "Date")
# Copy each series' Value column to a descriptive name
VolumeTrumpBiden[["TrumpVolume"]] <- VolumeTrumpBiden[["Value.x"]]
VolumeTrumpBiden[["BidenVolume"]] <- VolumeTrumpBiden[["Value.y"]]

#Plotting volume by date
library(plotly)
# One pipeline: Biden is the base line trace, Trump is added on top of it,
# then the title and axis options are applied.
fig <- plot_ly(
  data = VolumeTrumpBiden,
  x = ~Date,
  y = ~BidenVolume,
  name = "Biden",
  type = "scatter",
  mode = "lines",
  line = list(color = "#005F73")
) %>%
  add_trace(
    y = ~TrumpVolume,
    name = "Trump",
    mode = "lines",
    line = list(color = "#AE2012")
  ) %>%
  layout(
    title = "U.S. coverage volume, Biden v. Trump",
    xaxis = list(title = "Date", showgrid = FALSE),
    yaxis = list(title = "Volume", showgrid = TRUE)
  )
# Display the finished figure
fig


### Saving the data to a local .csv file
# The path is relative, so the file lands in the current working directory.
readr::write_csv(VolumeTrumpBiden, "VolumeTrumpBiden.csv")

# Paired-samples t-test
# Compares Biden and Trump coverage volume, pairing the two values that share
# a date in the merged data.

# Work on a copy of the merged data
mydata <- VolumeTrumpBiden

# Specify the two variables involved (V1 and V2 are also what the optional
# Wilcoxon code further down refers to)
mydata$V1 <- mydata$TrumpVolume
mydata$V2 <- mydata$BidenVolume

# Look at the distribution of the pair differences (Biden minus Trump)
mydata$PairDifferences <- mydata$V2 - mydata$V1

ggplot(mydata, aes(x = PairDifferences)) +
  # bins = 30 is the value ggplot2 falls back to anyway; stating it avoids
  # the "pick better value" message
  geom_histogram(bins = 30, color = "black", fill = "dodgerblue") +
  # Single vertical line at the mean difference; na.rm matches the
  # descriptive statistics below
  geom_vline(xintercept = mean(mydata$PairDifferences, na.rm = TRUE))

# Get descriptive statistics for pair differences
mydata %>%
  summarise(
    count = n(),
    mean = mean(PairDifferences, na.rm = TRUE),
    sd = sd(PairDifferences, na.rm = TRUE),
    min = min(PairDifferences, na.rm = TRUE),
    max = max(PairDifferences, na.rm = TRUE)
  )

# If pair differences look non-normal, you can use a Shapiro-Wilk test to check
# whether their distribution differs significantly from normal. If the
# Shapiro-Wilk test p-value is less than 0.05, use a Wilcoxon signed rank test
# instead of a paired-samples t-test.

# Shapiro-Wilk test
# options(scipen = 999)
# shapiro.test(mydata$PairDifferences)

# If the pair distribution is non-normal, consider using a Wilcoxon signed
# rank test instead of a paired-samples t-test.

# mydata %>%
#   summarise(across(c(V1, V2), list(Mean = mean, SD = sd)))
# wilcox.test(mydata$V1, mydata$V2, paired = TRUE)

# If the pair differences are normally distributed,
# though, you may use a paired-samples t-test.

mydata$Trump <- mydata$V1
mydata$Biden <- mydata$V2

# Mean and SD of each series, shown once alongside the test result.
# Output columns: Trump_Mean, Trump_SD, Biden_Mean, Biden_SD.
mydata %>%
  summarise(across(
    c(Trump, Biden),
    list(
      Mean = ~ mean(.x, na.rm = TRUE),
      SD = ~ sd(.x, na.rm = TRUE)
    )
  ))

# Print p-values in fixed rather than scientific notation
options(scipen = 999)
t.test(mydata$Biden, mydata$Trump,
       paired = TRUE)

# Trump and Biden coverage volume demo
#
# Dr. Ken Blake
#
# 2023-08-29
#
# Purpose