Here’s an occasionally updated comparison of how many online U.S. news articles per day mention Donald Trump and how many mention Kamala Harris. The data come from GDELT, the Global Database of Events, Language and Tone. You can explore other coverage trends via GDELT’s News Comparer.
A paired-samples t-test can indicate whether the difference between the two volumes is statistically significant. A p-value of 0.05 or less suggests significance. Note, though, that if the histogram of the pair differences appears non-normal, a paired-samples t-test may not be suitable.
## count mean sd min max
## 1 131 -1089.733 1342.318 -7228 746
## V1_Mean V2_Mean V1_SD V2_SD
## 1 2443.908 1354.176 1352.086 1101.926
##
## Paired t-test
##
## data: mydata$V2 and mydata$V1
## t = -9.2918, df = 130, p-value = 0.0000000000000004587
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -1321.7551 -857.7106
## sample estimates:
## mean difference
## -1089.733
#Installing and loading the tidyverse package
if (!require("tidyverse"))
install.packages("tidyverse")
if (!require("dplyr"))
install.packages("dplyr")
if (!require("plotly"))
install.packages("plotly")
if (!require("readr"))
install.packages("readr")
library(tidyverse)
library(dplyr)
library(plotly)
library(ggplot2)
library(readr)
### Date range
# Both bounds are YYYYMMDD strings; the "000000" time-of-day suffix is appended
# when the request URL is assembled.
startdate <- "20240601"
enddate <- "20240927"

# Download the raw article-count timeline for one GDELT query.
#
# query: GDELT query string, e.g. "'Donald Trump' SourceCountry:US".
# start, end: date bounds as YYYYMMDD strings (default: the range set above).
#
# Prints the encoded request URL, reads the CSV response, and returns only the
# rows whose Series is "Article Count", with Date converted to class Date.
# Stops with an explicit message when the response lacks the Date / Series /
# Value columns that the rest of the script relies on.
get_volume <- function(query, start = startdate, end = enddate) {
  text_v_url <- paste0(
    "https://api.gdeltproject.org/api/v2/doc/doc?query=", query,
    "&mode=timelinevolraw&startdatetime=", start,
    "000000&enddatetime=", end,
    "000000&format=CSV"
  )
  v_url <- URLencode(text_v_url)
  # Echo the URL so the request can be re-run or inspected by hand
  print(v_url)

  volume <- read_csv(v_url)

  absent <- setdiff(c("Date", "Series", "Value"), names(volume))
  if (length(absent) > 0) {
    stop(
      "GDELT response for query <", query, "> is missing column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  volume$Date <- as.Date(volume$Date, "%Y-%m-%d")
  volume %>%
    filter(Series == "Article Count")
}

### Topic A
VolumeA <- get_volume("'Donald Trump' SourceCountry:US")

### Topic B
VolumeB <- get_volume("'Kamala Harris' SourceCountry:US")
### Merging
# Join the two series on Date. Both inputs carry a Value column, so merge()
# disambiguates them as Value.x (Topic A) and Value.y (Topic B); copy those
# into clearly named columns for the plot and the t-test.
VolumeAB <- merge(x = VolumeA, y = VolumeB, by = "Date")
VolumeAB[["VolumeA"]] <- VolumeAB[["Value.x"]]
VolumeAB[["VolumeB"]] <- VolumeAB[["Value.y"]]
# Plotting volume by date: one line per topic on a shared date axis
library(plotly)
fig <- VolumeAB %>%
  plot_ly(
    x = ~Date,
    y = ~VolumeA,
    name = "Trump", # <= Enter label for series A
    type = "scatter",
    mode = "lines",
    line = list(color = "#AE2012") # <= Enter color for series A
  ) %>%
  add_trace(
    y = ~VolumeB,
    name = "Harris", # <= Enter label for series B
    mode = "lines",
    line = list(color = "#005F73") # <= Enter color for series B
  ) %>%
  layout(
    title = "U.S. coverage volume",
    xaxis = list(title = "Date", showgrid = FALSE),
    yaxis = list(title = "Volume", showgrid = TRUE)
  )
# Auto-print to render the interactive chart
fig
### Saving the data to a local .csv file
# Writes the merged table to VolumeAB.csv in the current working directory.
readr::write_csv(VolumeAB, "VolumeAB.csv")
# Paired-samples t-test
# Start from the merged table
mydata <- VolumeAB
# Specify the two variables involved
mydata$V1 <- mydata$VolumeA
mydata$V2 <- mydata$VolumeB
# Look at the distribution of the pair differences.
# Differences are V2 minus V1, so negative values mean V1 was larger that day.
mydata$PairDifferences <- mydata$V2 - mydata$V1
ggplot(mydata, aes(x = PairDifferences)) +
  geom_histogram(color = "black", fill = "dodgerblue") +
  # Vertical line marks the mean difference; na.rm = TRUE keeps it in step
  # with the descriptive statistics below if any difference is missing.
  geom_vline(aes(xintercept = mean(PairDifferences, na.rm = TRUE)))
# Get descriptive statistics for pair differences
mydata %>%
  select(PairDifferences) %>%
  summarise(
    count = n(),
    mean = mean(PairDifferences, na.rm = TRUE),
    sd = sd(PairDifferences, na.rm = TRUE),
    min = min(PairDifferences, na.rm = TRUE),
    max = max(PairDifferences, na.rm = TRUE)
  )
# If pair differences look non-normal, you can use a Shapiro-Wilk test to check
# whether their distribution differs significantly from normal. If the
# Shapiro-Wilk test p-value is less than 0.05, use a Wilcoxon signed rank test
# instead of a paired-samples t-test.
# Shapiro-Wilk test
# options(scipen = 999)
# shapiro.test(mydata$PairDifferences)
# If the pair distribution is non-normal, consider using a Wilcoxon signed rank
# test instead of a paired-samples t-test.
# mydata %>%
# select(V1, V2) %>%
# summarise_all(list(Mean = mean, SD = sd))
# wilcox.test(mydata$V1, mydata$V2, paired = TRUE)
# If the pair differences are normally distributed,
# though, you may use a paired-samples t-test.
# Mean and standard deviation of each series. across() replaces the superseded
# summarise_all(); the two calls are ordered so the output columns stay
# V1_Mean, V2_Mean, V1_SD, V2_SD.
mydata %>%
  summarise(
    across(c(V1, V2), mean, .names = "{.col}_Mean"),
    across(c(V1, V2), sd, .names = "{.col}_SD")
  )
# Print the p-value in fixed notation rather than scientific notation
options(scipen = 999)
# V2 is passed first, so the reported mean difference is V2 minus V1
t.test(mydata$V2, mydata$V1,
  paired = TRUE
)