# Abstract: This project analyzes the relationship between presidential polling and news coverage from CNN and Fox News. I would like to understand how news coverage influences a candidate's favorability. My objective is to show the current polling trend for each candidate, the changes over time, and to check whether there is a correlation between news coverage and polling results.
# I will use the presidential polling dataset from https://projects.fivethirtyeight.com/polls/president-general/ and scrape articles mentioning Biden and Trump from Foxnews.com and CNN.com covering the past 10 months.
library(tidyverse)
library(rvest)
library(xml2)
library(dplyr)
library(ggplot2)
library(jsonlite)
library(lubridate)
library(httr)
library(stringr)
library(tidytext)
library(wordcloud)
library(textdata)
Polls <- read.csv("https://raw.githubusercontent.com/Angelogallardo05/DATA607-final/main/president_polls%20(1).csv", header = TRUE)
# Drop columns where every value is NA
polls_cleaned <- Polls %>%
  select_if(~ !all(is.na(.)))
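# A quick sanity check on which all-NA columns were dropped:
setdiff(names(Polls), names(polls_cleaned))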
candidate_subset <- subset(polls_cleaned, candidate_name %in% c("Joe Biden", "Donald Trump"))
candidate_subset <- candidate_subset %>%
  mutate(start_date = as.Date(start_date, format = "%m/%d/%y")) # Convert 'start_date' to Date
# Extract month and year from 'start_date'
candidate_subset <- candidate_subset %>%
mutate(month_year = floor_date(start_date, unit = "month")) # Round down to the nearest month
# Calculate average percentage for each candidate within each month
monthly_avg <- candidate_subset %>%
  group_by(candidate_name, month_year) %>%
  summarise(avg_pct = mean(pct, na.rm = TRUE), .groups = "drop") # drop grouping to avoid the summarise() message
# Calculate month-over-month percentage change in the monthly averages
monthly_avg <- monthly_avg %>%
  arrange(candidate_name, month_year) %>%
  group_by(candidate_name) %>%
  mutate(percentage_change = (avg_pct - lag(avg_pct)) / lag(avg_pct) * 100) %>% # first month is NA by construction, hence the removed-rows warnings below
  ungroup()
# Plotting using ggplot2
ggplot(monthly_avg, aes(x = month_year, y = percentage_change, color = candidate_name)) +
geom_line() +
geom_point() +
labs(x = "Month", y = "Percentage Change", title = "Monthly Percentage Change of Candidates") +
theme_minimal()
## Warning: Removed 2 rows containing missing values (`geom_line()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
swing_states <- c("Arizona", "Georgia", "Pennsylvania","Michigan","Nevada", "North Carolina", "Wisconsin") # List of swing states
# Filter and aggregate data
filtered_subset <- candidate_subset %>%
  filter(year(start_date) >= 2023, state %in% swing_states) %>%
  mutate(month_year = floor_date(start_date, "month")) %>% # Round down to month
  group_by(state, month_year, candidate_name) %>%
  summarize(avg_pct = mean(pct, na.rm = TRUE), .groups = "drop")
# Determine the number of unique states
unique_states <- unique(filtered_subset$state)
num_states <- length(unique_states)
# Calculate an appropriate plot height based on the number of states
plot_height <- min(10 * num_states, 120) # Adjust the multiplier (10) based on your preference
options(repr.plot.width = 10, repr.plot.height = plot_height) # Adjust width and height
# Plotting with ggplot
ggplot(filtered_subset, aes(x = month_year, y = avg_pct, color = candidate_name)) +
geom_line() +
labs(x = "Start Date", y = "Percentage (%)", color = "Candidate") +
scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") + # Display every 2 months
theme(
axis.text.x = element_text(size = 8, angle = 45, hjust = 1) # Adjust size for better readability
) +
facet_wrap(~ state, nrow = ceiling(num_states / 3), scales = "free_y") # Adjust nrow for better layout
# Trump CNN articles
trump_cnn <- c(
"https://www.cnn.com/2024/04/30/politics/trump-political-violence-2024-time-interview/index.html",
"https://www.cnn.com/2024/03/23/politics/trump-campaign-courthouse-bond/index.html",
"https://www.cnn.com/2024/02/28/politics/supreme-court-immunity-trump-biskupic-analysis/index.html",
"https://www.cnn.com/2024/01/29/politics/trump-border-middle-east-election/index.html",
"https://www.cnn.com/2023/12/19/politics/donald-trump-extreme-rhetoric-2024-election/index.html",
"https://www.cnn.com/2023/11/21/politics/trump-gag-order-appeal-election-year-stress-test/index.html",
"https://www.cnn.com/2023/10/26/politics/trump-legal-system-rules-2024/index.html",
"https://www.cnn.com/2023/09/26/politics/trump-organization-business-fraud/index.html",
"https://www.cnn.com/2023/08/10/politics/trump-legal-drama-grows/index.html",
"https://www.cnn.com/2023/07/28/politics/trump-classified-documents-case-2024/index.html",
"https://www.cnn.com/2023/06/10/politics/trump-campaign-indictment-georgia-north-carolina/index.html"
)
# Create DataFrame
trump_cnn_df <- data.frame(URL = trump_cnn)
# Create empty lists to store extracted data
titles <- c()
dates <- c()
texts <- c()
presidents <- c() # Create an empty list for the President column
# Loop through each URL in the dataframe
for (i in 1:nrow(trump_cnn_df)) {
url <- trump_cnn_df$URL[i]
# Read HTML content from the URL
page <- read_html(url)
# Extract title
title <- page %>%
html_nodes("title") %>%
html_text() %>%
first() # Take the first element (assuming there's only one title)
titles <- c(titles, title)
# Extract date
date <- page %>%
html_node(xpath = "//*[contains(@class, 'timestamp')]") %>%
html_text() %>%
trimws() # Trim extra spaces
date <- gsub("^[^,]+,", "", date)
dates <- c(dates, date)
# Extract text content from JSON-LD script
json_ld_script <- page %>%
html_nodes("script[type='application/ld+json']") %>%
html_text() %>%
paste(collapse = "\n")
# Parse JSON-LD data
if (nzchar(json_ld_script)) { # Check if JSON-LD script is not empty
json_data <- jsonlite::fromJSON(json_ld_script) # Use jsonlite:: prefix
article_text <- json_data$articleBody
} else {
article_text <- NA
}
texts <- c(texts, article_text)
# Add "Donald Trump" to the presidents list (for each URL)
presidents <- c(presidents, "Donald Trump")
}
# Create a dataframe from extracted data
trump_cnn_data <- data.frame(
URL = trump_cnn_df$URL,
Title = titles,
Date = dates,
Text = texts,
President = presidents, # Include the new "President" column
stringsAsFactors = FALSE # Ensure strings are treated as characters, not factors
)
# Biden CNN articles
biden_cnn <- c(
"https://www.cnn.com/2024/04/28/politics/biden-trump-nostalgia/index.html",
"https://www.cnn.com/2024/03/19/politics/joe-biden-western-swing/index.html",
"https://www.cnn.com/2024/02/22/politics/biden-bully-pulpit-trump-russia/index.html",
"https://www.cnn.com/2024/01/24/politics/joe-biden-campaign-donald-trump/index.html",
"https://www.cnn.com/2023/12/07/politics/biden-compromise-border-security/index.html",
"https://www.cnn.com/2023/11/24/politics/biden-remarks-initial-hostage-release-israel-gaza/index.html",
"https://www.cnn.com/2023/10/19/politics/biden-oval-office-speech-israel-ukraine/index.html",
"https://www.cnn.com/2023/09/18/politics/biden-iran-americans-politics/index.html",
"https://www.cnn.com/2023/08/27/politics/biden-summer-vacation-maui-ukraine-2024-election/index.html",
"https://www.cnn.com/2023/07/28/politics/biden-economy-maine-trump/index.html",
"https://www.cnn.com/2023/06/26/politics/joe-biden-russia/index.html"
)
# Create DataFrame
biden_cnn_df <- data.frame(URL = biden_cnn)
# Create empty lists to store extracted data
titles <- c()
dates <- c()
texts <- c()
presidents <- c() # Create an empty list for the President column
# Loop through each URL in the dataframe
for (i in 1:nrow(biden_cnn_df)) {
url <- biden_cnn_df$URL[i]
# Read HTML content from the URL
page1 <- read_html(url)
# Extract title
title <- page1 %>%
html_nodes("title") %>%
html_text() %>%
first() # Take the first element (assuming there's only one title)
titles <- c(titles, title)
# Extract date
date <- page1 %>%
html_node(xpath = "//*[contains(@class, 'timestamp')]") %>%
html_text() %>%
trimws() # Trim extra spaces
date <- gsub("^[^,]+,", "", date)
dates <- c(dates, date)
# Extract text content from JSON-LD script
json_ld_script <- page1 %>%
html_nodes("script[type='application/ld+json']") %>%
html_text() %>%
paste(collapse = "\n")
# Parse JSON-LD data
if (nzchar(json_ld_script)) { # Check if JSON-LD script is not empty
json_data <- jsonlite::fromJSON(json_ld_script) # Use jsonlite:: prefix
article_text <- json_data$articleBody
} else {
article_text <- NA # Set to NA if JSON-LD script is empty or not found
}
texts <- c(texts, article_text)
# Add "Joe Biden" to the presidents list (for each URL)
presidents <- c(presidents, "Joe Biden")
}
# Create a dataframe from extracted data
biden_cnn_data <- data.frame(
URL = biden_cnn_df$URL,
Title = titles,
Date = dates,
Text = texts,
President = presidents, # Include the new "President" column
stringsAsFactors = FALSE # Ensure strings are treated as characters, not factors
)
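# The Trump and Biden CNN loops above are identical except for the URL vector
# and the President label. A minimal refactor sketch (scrape_cnn_article is a
# hypothetical helper name; parsing only the first JSON-LD script is an
# assumption, since pasting several scripts together can produce invalid JSON):
scrape_cnn_article <- function(url, president) {
  page <- read_html(url)
  title <- page %>% html_nodes("title") %>% html_text() %>% first()
  date <- page %>%
    html_node(xpath = "//*[contains(@class, 'timestamp')]") %>%
    html_text() %>%
    trimws()
  date <- gsub("^[^,]+,", "", date)
  json_ld <- page %>% html_nodes("script[type='application/ld+json']") %>% html_text()
  article_text <- if (length(json_ld) > 0) {
    jsonlite::fromJSON(json_ld[1])$articleBody # assumes the first script carries articleBody
  } else {
    NA
  }
  if (is.null(article_text)) article_text <- NA
  data.frame(URL = url, Title = title, Date = date, Text = article_text,
             President = president, stringsAsFactors = FALSE)
}
# Usage (equivalent to the two loops above):
# trump_cnn_data <- bind_rows(lapply(trump_cnn, scrape_cnn_article, president = "Donald Trump"))
# biden_cnn_data <- bind_rows(lapply(biden_cnn, scrape_cnn_article, president = "Joe Biden"))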
combined_cnn_data <- rbind(trump_cnn_data, biden_cnn_data)
# Optional: Reset row names/index of the combined dataframe
rownames(combined_cnn_data) <- NULL
combined_cnn_data <- combined_cnn_data %>%
mutate(Date = mdy(Date))
bing_lexicon <- get_sentiments("bing")
head(bing_lexicon)
# Sentiment analysis
tokenized_data <- combined_cnn_data %>%
unnest_tokens(word, Text)
# Join the tokenized data with the bing lexicon to assign sentiment scores
data_with_sentiment <- tokenized_data %>%
inner_join(bing_lexicon, by = c(word = "word"))
# Convert 'Date' to month-year format for aggregation
data_with_sentiment <- data_with_sentiment %>%
mutate(month_year = floor_date(Date, unit = "month"))
# Aggregate sentiment scores by month, president, and sentiment
sentiment_summary <- data_with_sentiment %>%
  group_by(President, month_year, sentiment) %>%
  summarize(count = n(), .groups = "drop") %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(total_words = positive + negative,
         positive_pct = positive / total_words * 100,
         negative_pct = negative / total_words * 100) %>%
  select(President, month_year, positive_pct, negative_pct)
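# Note: spread() still works but is superseded in current tidyr; the same
# reshape can be written with pivot_wider() (sentiment_summary_pw is just an
# illustrative name, the result matches sentiment_summary):
sentiment_summary_pw <- data_with_sentiment %>%
  group_by(President, month_year, sentiment) %>%
  summarize(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0) %>%
  mutate(total_words = positive + negative,
         positive_pct = positive / total_words * 100,
         negative_pct = negative / total_words * 100) %>%
  select(President, month_year, positive_pct, negative_pct)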
# Sentiment scores from CNN: coverage skews negative on Trump and is roughly even on Biden
sentiment_scores <- tokenized_data %>%
inner_join(bing_lexicon, by = "word") %>%
count(President, sentiment) %>%
spread(sentiment, n, fill = 0)
# Calculate sentiment percentages
sentiment_scores <- sentiment_scores %>%
group_by(President) %>%
mutate(total_words = sum(positive, negative),
positive_percent = (positive / total_words) * 100,
negative_percent = (negative / total_words) * 100) %>%
select(President, positive_percent, negative_percent)
print(sentiment_scores)
## # A tibble: 2 × 3
## # Groups: President [2]
## President positive_percent negative_percent
## <chr> <dbl> <dbl>
## 1 Donald Trump 45.5 54.5
## 2 Joe Biden 50.5 49.5
presidents <- unique(sentiment_summary$President)
for (president in presidents) {
# Filter data for the current president
president_data <- sentiment_summary %>%
filter(President == president)
# Create line plot for positive and negative sentiment
plot <- ggplot(president_data, aes(x = month_year)) +
geom_line(aes(y = positive_pct, color = "Positive")) +
geom_line(aes(y = negative_pct, color = "Negative")) +
labs(title = paste("Sentiment Analysis for", president),
x = 'Month-Year',
y = 'Percentage',
color = 'Sentiment') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_color_manual(values = c("Negative" = "red", "Positive" = "blue"))
print(plot)
}
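# Instead of looping over presidents, the same comparison can be drawn in one
# faceted plot; a sketch using the summary built above:
sentiment_long <- sentiment_summary %>%
  pivot_longer(c(positive_pct, negative_pct),
               names_to = "sentiment", names_pattern = "(.*)_pct",
               values_to = "pct")
ggplot(sentiment_long, aes(x = month_year, y = pct, color = sentiment)) +
  geom_line() +
  facet_wrap(~ President) +
  labs(title = "CNN coverage sentiment by month", x = "Month-Year",
       y = "Percentage", color = "Sentiment") +
  theme_minimal()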
trump_fox <- c(
"https://www.foxnews.com/politics/trump-leads-biden-important-issues-voters-2024-election-polls-show",
"https://www.foxnews.com/opinion/issue-cost-trump-2024-campaign",
"https://www.foxnews.com/politics/challenging-biden-debates-trumps-tells-ingraham-ill-do-it-right-now-your-show",
"https://www.foxnews.com/politics/trump-starts-2024-in-strongest-possible-position-republican-presidential-primary-race",
"https://www.foxnews.com/media/2023-year-liberal-outlets-attacked-platforming-trump",
"https://www.foxnews.com/media/trump-vs-biden-dramatic-difference-media-treat-each-campaign",
"https://www.foxnews.com/politics/trump-accept-speakership-for-short-period-republicans-decide-permanent-replacement",
"https://www.foxnews.com/politics/trumps-race-lose-four-half-months-ahead-first-votes-gop-presidential-nomination",
"https://www.foxnews.com/politics/trump-indicted-fourth-time-he-leads-2024-primary-field",
"https://www.foxnews.com/politics/trump-reacts-to-abuse-by-doj-this-is-what-you-get-for-leading-the-2024-polls",
"https://www.foxnews.com/media/trump-calls-nara-radical-left-group-couldve-stuffed-boxes-during-mar-a-lago-document-raid"
)
# Create DataFrame
trump_fox_df <- data.frame(URL = trump_fox)
# Create empty lists to store extracted data
titles <- c()
dates <- c()
texts <- c()
presidents <- c()
# Loop through each URL in the dataframe
for (i in 1:nrow(trump_fox_df)) {
url <- trump_fox_df$URL[i]
# Read HTML content from the URL
page <- read_html(url)
# Extract title
title <- page %>%
html_nodes("title") %>%
html_text() %>%
first() # Take the first element (assuming there's only one title)
# Extract date
date <- page %>%
html_node("time") %>%
html_text() %>%
trimws() # Trim extra spaces
  # Extract text content from <p class="speakable"> elements
  text_elements <- page %>%
    html_nodes("p.speakable") # Select all <p> elements with class "speakable"
  if (length(text_elements) > 0) {
    # Collapse all "speakable" paragraphs into a single string so each URL
    # contributes exactly one row (appending a multi-element vector here would
    # misalign Text with Title/Date and silently inflate the row count)
    article_text <- text_elements %>% html_text() %>% trimws() %>% paste(collapse = " ")
  } else {
    article_text <- NA # Set to NA if no text elements found
  }
# Add extracted data to respective lists
titles <- c(titles, title)
dates <- c(dates, date)
texts <- c(texts, article_text)
presidents <- c(presidents, "Donald Trump") # Add "Donald Trump" as President
# Debugging: Print extracted data for each URL
cat("Processed URL:", url, "\n")
cat("Title:", title, "\n")
cat("Date:", date, "\n")
cat("Text Length:", length(article_text), "\n")
cat("\n")
}
# Create a dataframe from extracted data
trump_fox_data <- data.frame(
URL = trump_fox_df$URL,
Title = titles,
Date = dates,
Text = texts,
President = presidents,
stringsAsFactors = FALSE
)
# Articles containing Biden on Fox News
biden_fox <- c(
"https://www.foxnews.com/politics/new-poll-biden-2024-lead-vanishing-trump-trial",
"https://www.foxnews.com/politics/locking-it-up-biden-clinches-2024-democrat-presidential-nomination-during-tuesdays-primaries",
"https://www.foxnews.com/politics/biden-making-unannounced-visit-walter-reed-physical-exam",
"https://www.foxnews.com/us/biden-brought-chaos-trump-says-hell-bring-success-hunter-biden-heads-court-more-top-headlines",
"https://www.foxnews.com/politics/bidens-polling-problem-running-re-election-2024-president-ends-2023-underwater",
"https://www.foxnews.com/politics/white-house-quietly-walks-back-bidens-comment-adding-conditions-sending-assistance-israel",
"https://www.foxnews.com/politics/biden-shuns-calls-de-escalate-vows-us-israel-back-prepares-ground-war-hamas",
"https://www.foxnews.com/politics/fox-news-politics-biden-impeachment",
"https://www.foxnews.com/politics/biden-family-huddles-in-secrecy-exclusive-lake-tahoe-home-special-counsel-investigation-hunter",
"https://www.foxnews.com/politics/white-house-ignores-republicans-demanding-answers-bidens-knowledge-hunter-bidens-business-dealings",
"https://www.foxnews.com/politics/biden-admits-not-big-abortion-terminations-last-three-months-negotiated"
)
# Create DataFrame
biden_fox_df <- data.frame(URL = biden_fox)
# Create empty lists to store extracted data
titles <- c()
dates <- c()
texts <- c()
presidents <- c()
# Loop through each URL in the dataframe
for (i in 1:nrow(biden_fox_df)) {
url <- biden_fox_df$URL[i]
# Read HTML content from the URL
page <- read_html(url)
# Extract title
title <- page %>%
html_nodes("title") %>%
html_text() %>%
first() # Take the first element (assuming there's only one title)
# Extract date
date <- page %>%
html_node("time") %>%
html_text() %>%
trimws() # Trim extra spaces
  # Extract text content from <p class="speakable"> elements
  text_elements <- page %>%
    html_nodes("p.speakable") # Select all <p> elements with class "speakable"
  if (length(text_elements) > 0) {
    # Collapse paragraphs into one string per URL (see note in the Trump loop)
    article_text <- text_elements %>% html_text() %>% trimws() %>% paste(collapse = " ")
  } else {
    article_text <- NA # Set to NA if no text elements found
  }
# Add extracted data to respective lists
titles <- c(titles, title)
dates <- c(dates, date)
texts <- c(texts, article_text)
presidents <- c(presidents, "Joe Biden") # Add "Biden" as President
# Debugging: Print extracted data for each URL
cat("Processed URL:", url, "\n")
cat("Title:", title, "\n")
cat("Date:", date, "\n")
cat("Text Length:", length(article_text), "\n")
cat("\n")
}
# Create a dataframe from extracted data
biden_fox_data <- data.frame(
  URL = biden_fox_df$URL, # was mistakenly trump_fox_df$URL
  Title = titles,
  Date = dates,
  Text = texts,
  President = presidents,
  stringsAsFactors = FALSE
)
# Combine the dataframes using rbind
combined_fox_data <- rbind(trump_fox_data, biden_fox_data)
rownames(combined_fox_data) <- NULL
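# The Fox 'Date' column is still the raw text of the <time> node; it must be
# parsed into a date/datetime before floor_date() can bucket it by month. A
# minimal sketch, assuming timestamps shaped like "April 22, 2024 7:03pm EDT"
# (the exact on-page format may differ, so adjust the orders accordingly):
combined_fox_data <- combined_fox_data %>%
  mutate(Date = str_remove(Date, "\\s+[A-Z]{2,4}$"), # drop a trailing zone abbreviation like "EDT"
         Date = parse_date_time(Date, orders = c("mdY IMp", "mdY"), tz = "America/New_York"))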
glimpse(combined_fox_data)
## Rows: 22
## Columns: 5
## $ URL <chr> "https://www.foxnews.com/politics/trump-leads-biden-importan…
## $ Title <chr> "New poll reveals voters prefer Trump on these key issues | …
## $ Date <dttm> 2024-04-22 19:03:00, 2024-03-13 05:00:00, 2024-02-20 20:00:…
## $ Text <chr> "Most voters prefer former President Donald Trump to handle …
## $ President <chr> "Donald Trump", "Donald Trump", "Donald Trump", "Donald Trum…
fox_tokenized_data <- combined_fox_data %>%
unnest_tokens(word, Text)
# Join the tokenized data with the bing lexicon to assign sentiment scores
fox_data_with_sentiment <- fox_tokenized_data %>%
inner_join(bing_lexicon, by = c(word = "word"))
# Fox News sentiment analysis
# Convert 'Date' to month-year format for aggregation
fox_data_with_sentiment <- fox_data_with_sentiment %>%
mutate(month_year = floor_date(Date, unit = "month"))
# Aggregate sentiment scores by month, president, and sentiment
fox_sentiment_summary <- fox_data_with_sentiment %>%
  group_by(President, month_year, sentiment) %>%
  summarize(count = n(), .groups = "drop") %>%
  spread(sentiment, count, fill = 0) %>%
  mutate(total_words = positive + negative,
         positive_pct = positive / total_words * 100,
         negative_pct = negative / total_words * 100) %>%
  select(President, month_year, positive_pct, negative_pct)
# Check the structure of sentiment_summary
print(fox_sentiment_summary)
# Fox sentiment scores: Fox News is markedly more positive toward Trump (about two-thirds positive words) while evenly split on Biden
fox_sentiment_scores <- fox_tokenized_data %>%
inner_join(bing_lexicon, by = "word") %>%
count(President, sentiment) %>%
spread(sentiment, n, fill = 0)
# Calculate sentiment percentages
fox_sentiment_scores <- fox_sentiment_scores %>%
group_by(President) %>%
mutate(total_words = sum(positive, negative),
positive_percent = (positive / total_words) * 100,
negative_percent = (negative / total_words) * 100) %>%
select(President, positive_percent, negative_percent)
print(fox_sentiment_scores)
## # A tibble: 2 × 3
## # Groups: President [2]
## President positive_percent negative_percent
## <chr> <dbl> <dbl>
## 1 Donald Trump 65.7 34.3
## 2 Joe Biden 50 50
f_presidents <- unique(fox_sentiment_summary$President)
for (president in f_presidents) {
# Filter data for the current president
president_data <- fox_sentiment_summary %>%
filter(President == president)
# Create line plot for positive and negative sentiment
plot <- ggplot(president_data, aes(x = month_year)) +
geom_line(aes(y = positive_pct, color = "Positive")) +
geom_line(aes(y = negative_pct, color = "Negative")) +
labs(title = paste("Sentiment Analysis for", president),
x = 'Month-Year',
y = 'Percentage',
color = 'Sentiment') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_color_manual(values = c("Negative" = "red", "Positive" = "blue"))
# Print the plot
print(plot)
}
glimpse(fox_sentiment_summary)
glimpse(sentiment_summary)
glimpse(monthly_avg)
combined_sentiments <- bind_rows(data_with_sentiment, fox_data_with_sentiment)
glimpse(combined_sentiments)
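# The abstract promised a correlation check between coverage and polling. A
# minimal sketch for the CNN side, joining monthly sentiment to the monthly
# polling averages (the join keys and the simple Pearson test are my own
# choices here, not an established part of the pipeline above):
cnn_vs_polls <- sentiment_summary %>%
  inner_join(monthly_avg, by = c("President" = "candidate_name", "month_year"))
cor.test(cnn_vs_polls$positive_pct, cnn_vs_polls$avg_pct) # overall Pearson correlation
# Per-candidate correlation between positive coverage share and average poll share
cnn_vs_polls %>%
  group_by(President) %>%
  summarize(r = cor(positive_pct, avg_pct, use = "complete.obs"), .groups = "drop")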
# Negative word clouds for each president
negative_words <- combined_sentiments %>%
filter(sentiment == "negative")
# Group by President and concatenate negative words
negative_words <- negative_words %>%
group_by(President) %>%
summarise(negative_text = paste(word, collapse = " "))
# Generate word cloud for each President
for (i in 1:nrow(negative_words)) {
president <- negative_words$President[i]
text <- negative_words$negative_text[i]
# Clean the text (optional): remove punctuation, numbers, and extra spaces
clean_text <- str_replace_all(text, "[^[:alpha:][:space:]]", "")
  # Create word cloud; wordcloud() has no 'main' argument, so add the title
  # with title() after drawing
  wordcloud(words = unlist(str_split(clean_text, "\\s+")),
            min.freq = 1,
            max.words = 50,
            random.order = FALSE,
            colors = brewer.pal(8, "Dark2"))
  title(main = paste("Negative Word Cloud for", president))
}
## Loading required namespace: tm
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
# It looks like there is a positive correlation between positive news coverage and polling results, and there does seem to be some bias in coverage from both outlets. The negative sentiment words can provide clues about which issues voters may see as important.
# We see that Trump is currently leading in the average presidential polling, and the positive coverage from both networks could have assisted with that. More importantly, the swing states seem to be at least partially influenced by news coverage, whether voters watch Fox News or CNN.
# Finally, I believe sentiment analysis can help a presidential campaign understand how each news agency is covering its candidate: it can inform strategy on shifting the candidate's topics and on using social media to defend against negative coverage. It could also help voters understand how their preferred outlet approaches a candidate; other networks may be covering the presidential campaigns more fairly.
# I believe I could have included a neutral sentiment score to understand when the news outlets were being impartial toward a candidate. I also could have scraped more news articles.
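# A rough way to approximate a neutral share with the Bing lexicon (which only
# labels words positive or negative) is to treat unmatched tokens as neutral.
# A sketch using the tokenized data built above; note that stop words and
# proper nouns inflate the "neutral" bucket, so this is only a ballpark figure:
neutral_share <- bind_rows(tokenized_data, fox_tokenized_data) %>%
  left_join(bing_lexicon, by = "word") %>%
  mutate(sentiment = replace_na(sentiment, "neutral")) %>%
  count(President, sentiment) %>%
  group_by(President) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ungroup()
print(neutral_share)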