Here’s an occasionally updated comparison of how many online U.S. news articles per day mention Donald Trump and how many mention Kamala Harris. The data come from GDELT, the Global Database of Events, Language and Tone. You can explore other coverage trends via GDELT’s News Comparer.


[Interactive line chart, "U.S. coverage volume": x-axis "Date" (Jun 2024 – Oct 2024), y-axis "Volume" (0 – 8000), with one line for Trump and one for Harris.]

A paired-samples t-test can indicate whether the difference between the volumes is statistically significant. A p-value of 0.05 or less suggests significance. Note, though, that if the gray histogram of pair differences appears non-normal, a paired-samples t-test may not be suitable.

##   count      mean       sd   min max
## 1   131 -1089.733 1342.318 -7228 746
##    V1_Mean  V2_Mean    V1_SD    V2_SD
## 1 2443.908 1354.176 1352.086 1101.926
## 
##  Paired t-test
## 
## data:  mydata$V2 and mydata$V1
## t = -9.2918, df = 130, p-value = 0.0000000000000004587
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -1321.7551  -857.7106
## sample estimates:
## mean difference 
##       -1089.733

Here’s the R script for producing the above output:

# Install any of the required packages that are missing, then load them
if (!require("tidyverse"))
  install.packages("tidyverse")
if (!require("dplyr"))
  install.packages("dplyr")
if (!require("plotly"))
  install.packages("plotly")
if (!require("readr"))
  install.packages("readr")
library(tidyverse)
library(dplyr)
library(plotly)
library(ggplot2)
library(readr)

### Date range
# Dates are "YYYYMMDD" strings; the request URL appends "000000" (midnight) to
# each of them.
startdate <- "20240601"
enddate <- "20240927"

# Download the raw article-count timeline that the GDELT doc API
# (mode = timelinevolraw) reports for one search query.
#
# query:     GDELT search expression, e.g. "'Donald Trump' SourceCountry:US".
# startdate: start of the window, as a "YYYYMMDD" string.
# enddate:   end of the window, as a "YYYYMMDD" string.
#
# Prints the encoded request URL, then returns the rows of the CSV response
# whose Series is "Article Count", with the Date column converted to class
# Date. Stops with an error if the response lacks the Date, Series or Value
# columns that the rest of the script relies on.
fetch_gdelt_volume <- function(query, startdate, enddate) {
  request_url <- URLencode(paste0(
    "https://api.gdeltproject.org/api/v2/doc/doc?query=", query,
    "&mode=timelinevolraw&startdatetime=", startdate,
    "000000&enddatetime=", enddate,
    "000000&format=CSV"
  ))
  # Show the URL so a failed or surprising download can be retried in a browser
  print(request_url)

  response <- read_csv(request_url)

  # Fail here with a clear message instead of later, at the merge or the plot
  absent <- setdiff(c("Date", "Series", "Value"), names(response))
  if (length(absent) > 0) {
    stop(
      "GDELT response for query <", query, "> is missing column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # `format` is passed by name so it cannot be positionally matched to a
  # different argument if read_csv() has already parsed the column
  response$Date <- as.Date(response$Date, format = "%Y-%m-%d")

  response %>%
    filter(Series == "Article Count")
}

### Topic A
VolumeA <- fetch_gdelt_volume(
  "'Donald Trump' SourceCountry:US",
  startdate,
  enddate
)

### Topic B
VolumeB <- fetch_gdelt_volume(
  "'Kamala Harris' SourceCountry:US",
  startdate,
  enddate
)

### Merging
# Join the two topics on Date, keeping only dates present in both. merge()
# suffixes the clashing column names with ".x" (topic A) and ".y" (topic B),
# so the two counts are copied to clearer column names.
VolumeAB <- merge(x = VolumeA, y = VolumeB, by = "Date")
VolumeAB[["VolumeA"]] <- VolumeAB[["Value.x"]]
VolumeAB[["VolumeB"]] <- VolumeAB[["Value.y"]]

# Plotting volume by date: one line per topic on a shared date axis
library(plotly)
fig <- VolumeAB %>%
  plot_ly(
    x = ~Date,
    y = ~VolumeA,
    name = "Trump", # <= Enter label for series A
    type = "scatter",
    mode = "lines",
    line = list(color = "#AE2012") # <= Enter color for series A
  ) %>%
  add_trace(
    y = ~VolumeB,
    name = "Harris", # <= Enter label for series B
    mode = "lines",
    line = list(color = "#005F73") # <= Enter color for series B
  ) %>%
  layout(
    title = "U.S. coverage volume",
    xaxis = list(title = "Date", showgrid = FALSE),
    yaxis = list(title = "Volume", showgrid = TRUE)
  )

# Display the finished figure
fig


### Export the merged data frame as VolumeAB.csv in the working directory
readr::write_csv(VolumeAB, "VolumeAB.csv")

# Paired-samples t-test

# Work on a copy of the merged data
mydata <- VolumeAB

# Specify the two variables involved
mydata$V1 <- mydata$VolumeA
mydata$V2 <- mydata$VolumeB

# Look at the distribution of the pair differences. Each difference is
# V2 minus V1, so a negative value means V1 had the larger volume that day.
mydata$PairDifferences <- mydata$V2 - mydata$V1

ggplot(mydata, aes(x = PairDifferences)) +
  # bins = 30 is geom_histogram()'s default; stating it explicitly avoids the
  # message ggplot2 prints when no bin setting is given
  geom_histogram(bins = 30, color = "black", fill = "dodgerblue") +
  # Vertical line at the mean pair difference
  geom_vline(aes(xintercept = mean(PairDifferences, na.rm = TRUE)))

# Get descriptive statistics for pair differences
mydata %>%
  summarise(
    count = n(),
    mean = mean(PairDifferences, na.rm = TRUE),
    sd = sd(PairDifferences, na.rm = TRUE),
    min = min(PairDifferences, na.rm = TRUE),
    max = max(PairDifferences, na.rm = TRUE)
  )

# If pair differences look non-normal, you can use a Shapiro-Wilk test to check
# whether their distribution differs significantly from normal. If the
# Shapiro-Wilk test p-value is less than 0.05, use a Wilcoxon signed rank test
# instead of a paired-samples t-test.

# Shapiro-Wilk test
# options(scipen = 999)
# shapiro.test(mydata$PairDifferences)

# If the pair distribution is non-normal, consider using a Wilcoxon signed
# rank test instead of a paired-samples t-test.

# mydata %>%
#   summarise(
#     V1_Mean = mean(V1, na.rm = TRUE),
#     V2_Mean = mean(V2, na.rm = TRUE),
#     V1_SD = sd(V1, na.rm = TRUE),
#     V2_SD = sd(V2, na.rm = TRUE)
#   )
# wilcox.test(mydata$V1, mydata$V2, paired = TRUE)

# If the pair differences are normally distributed,
# though, you may use a paired-samples t-test.

# Means and standard deviations of the two series. The columns are written out
# explicitly (rather than with the superseded summarise_all()) and use
# na.rm = TRUE, matching the pair-difference statistics above.
mydata %>%
  summarise(
    V1_Mean = mean(V1, na.rm = TRUE),
    V2_Mean = mean(V2, na.rm = TRUE),
    V1_SD = sd(V1, na.rm = TRUE),
    V2_SD = sd(V2, na.rm = TRUE)
  )

# Penalize scientific notation so a very small p-value prints in fixed
# notation. This changes the option for the rest of the session.
options(scipen = 999)

# The test is run on V2 - V1, the same direction as PairDifferences
t.test(mydata$V2, mydata$V1,
       paired = TRUE)