Background : Can bias in news reporting be measured with basic text analytics?
Goal : We are going to explore daily news feeds and see if we can detect differences in how various news sources report, or don't report, stories.
Can text analytics make sense of this?
Note : This article, like most of mine on R-Pubs, is both instructional and investigative. You can re-create any of these charts and tables from the embedded R code. Click the "Code" button to view.
QuietLoad <- function(library) {
suppressWarnings(suppressPackageStartupMessages(
library(library, character.only=TRUE)))
}
# Load libraries
QuietLoad('tidyverse')
QuietLoad('odbc')
QuietLoad('DBI')
QuietLoad('clipr')
QuietLoad('tidyRSS')
QuietLoad('rvest')
QuietLoad('tidytext')
QuietLoad('topicmodels')
QuietLoad('kableExtra')
QuietLoad('httr')
QuietLoad('xml2')
QuietLoad('XML')
QuietLoad('jsonlite')
QuietLoad('knitr')
QuietLoad('bigrquery')
First we will gather news articles from six sources: two that are thought to have a left-leaning bias, two from the center, and two that are thought to have a right-leaning bias.
Left : HUF (Huffington Post) and CBS (CBS News)
Center : BBC (BBC America) and USA (USA Today)
Right : FOX (Fox News) and WTM (Washington Times)
To keep the topics focused, the US headline news feeds were used.
The bias assumption was drawn from my own experience plus the categorization provided by AllSides.
There will be three elements to our bias analysis.
Exclusionary Bias : Topics and the news change daily but which topics are reported more, less or not at all by each side?
Sentiment Bias : Is each side reporting certain topics in a more positive or more negative tone than the other side?
Keyword Bias : Are articles that mention certain keywords reported more, less or not at all by each side, and what is the sentiment of the articles containing those keywords?
# Define our news source RSS links along with meta data
# about how we will filter and retrieve the data
FeedURLDF <- data.frame(
FeedURL = c("http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml",
"https://www.cbsnews.com/latest/rss/us",
"http://feeds.foxnews.com/foxnews/national",
"https://www.huffpost.com/section/front-page/feed",
"http://rssfeeds.usatoday.com/UsatodaycomNation-TopStories",
"https://www.washingtontimes.com/rss/headlines/news/national/"),
FeedName = c("BBC",
"CBS",
"FOX",
"HUF",
"USA",
"WTM"),
FeedType = factor(c("Center",
"Left",
"Right",
"Left",
"Center",
"Right"), levels = c("Left", "Center", "Right")),
FeedFilter = c("av|podcast|sport",
"video|podcast",
"podcast",
"podcast",
"videos|picture-gallery",
NA),
NodeType = c("xpath",
"css",
"css",
"css",
"css",
"css"),
FeedNode = c('//*[@id="main-content"]',
".content__body",
"div.article-body",
NA, #".entry__text",
NA, #"div.gnt_ar_b",
"div.storyareawrapper"),
XCludeNode = c('//*[@data-component="topStories"]|//*[@data-component="features"]|
//*[@data-component="mostRead"]|//*[@data-component="see-alsos"]',
NA,
"div.info",
NA,
NA,
"div.article-toplinks"),
ReplaceExp = c("\\.css.*?;\\}|\\@media.*?\\}",
"BulaBulla",
"BulaBulla",
"BulaBulla",
"BulaBulla",
"BulaBulla"),
stringsAsFactors = FALSE)
# Build Article Link Data Frame
# - Titles and reference links without the full article
for (i in 1:nrow(FeedURLDF)) {
TempDF <- tidyfeed(FeedURLDF$FeedURL[i]) %>%
dplyr::select(feed_title, item_title, item_link, item_description,
item_pub_date, item_guid) %>%
mutate(item_articletext = NA,
item_NodeType = FeedURLDF$NodeType[i],
item_FeedNode = FeedURLDF$FeedNode[i],
item_XCludeNode = FeedURLDF$XCludeNode[i],
item_ReplaceExp = FeedURLDF$ReplaceExp[i],
feed_type = FeedURLDF$FeedType[i],
feed_title = FeedURLDF$FeedName[i]) %>%
# General filter for blank article descriptions (CNN local news links)
filter(!is.na(item_description) & str_length(item_description) > 1) %>%
# Filter for Recent articles
filter(as.Date(item_pub_date) >= Sys.Date() - 5) %>%
# Specific filters for link URLs (excludes video for CBS, for example)
filter(is.na(FeedURLDF$FeedFilter[i]) |
!str_detect(item_link, FeedURLDF$FeedFilter[i])) %>%
filter(is.na(FeedURLDF$FeedFilter[i]) |
!str_detect(item_guid, FeedURLDF$FeedFilter[i]))
if (i == 1) {
ArticleDF <- TempDF
} else {
ArticleDF <- bind_rows(ArticleDF, TempDF)
}
}
# News sources have different article counts. These next steps will
# normalize the article count per source in 4 steps
# Step 1 : Number and Index by Feed Source
ArticleDF <- ArticleDF %>%
group_by(feed_title) %>%
mutate(item_index = row_number(),
feed_count = n()) %>%
ungroup()
# Step 2 : Summarize the Article DF with counts of articles by source
SelectorDF <- ArticleDF %>%
group_by(feed_title) %>%
summarise(feed_count = min(feed_count), .groups = "drop")
MinCount <- min(SelectorDF$feed_count)
# Step 3 : Using the Selector DF, create a random Selector Index vector
for (i in 1:nrow(SelectorDF)) {
if (i==1) {
FinalSelectorDF <- tibble(feed_title = rep(SelectorDF$feed_title[i], SelectorDF$feed_count[i]),
SelectorIndex = sample(1:SelectorDF$feed_count[i],
size = SelectorDF$feed_count[i]))
} else {
FinalSelectorDF <- bind_rows(FinalSelectorDF,
tibble(feed_title = rep(SelectorDF$feed_title[i], SelectorDF$feed_count[i]),
SelectorIndex = sample(1:SelectorDF$feed_count[i],
size = SelectorDF$feed_count[i])))
}
}
FinalSelectorDF <- FinalSelectorDF %>%
filter(SelectorIndex <= MinCount)
# Step 4 : Filter each source down to the minimum source count using the
# selector index
ArticleDF <- FinalSelectorDF %>%
inner_join(ArticleDF, by = c("feed_title" = "feed_title", "SelectorIndex" = "item_index"))
# Retrieve the full article detail from each link
# - Uses the meta data to determine how to retrieve and filter
for (i in 1:nrow(ArticleDF)) {
tryCatch(
if (ArticleDF$item_NodeType[i]=="css") {
TempText <- ArticleDF$item_link[i] %>%
read_html()
# If item_FeedNode is NA there is no need to filter nodes; we'll just take
# all <p> later
if (!is.na(ArticleDF$item_FeedNode[i])) {
TempText <- TempText %>%
html_nodes(css = ArticleDF$item_FeedNode[i])
}
if (!is.na(ArticleDF$item_XCludeNode[i])) {
TempXcludeText <- TempText %>%
html_nodes(css = ArticleDF$item_XCludeNode[i])
xml_remove(TempXcludeText)
}
TempText <- TempText %>%
html_nodes("p") %>%
html_text() %>%
toString() %>%
str_replace_all(ArticleDF$item_ReplaceExp[i], "")
} else {
TempText <- ArticleDF$item_link[i] %>%
read_html() %>%
html_nodes(xpath = ArticleDF$item_FeedNode[i])
if (!is.na(ArticleDF$item_XCludeNode[i])) {
TempXcludeText <- TempText %>%
html_nodes(xpath = ArticleDF$item_XCludeNode[i])
xml_remove(TempXcludeText)
}
TempText <- TempText %>%
html_nodes("p") %>%
html_text() %>%
toString() %>%
str_replace_all(ArticleDF$item_ReplaceExp[i], "")
}
, error = function(cond) {
print(cond)
# Use <<- so a failed retrieval clears TempText in the loop's environment
TempText <<- ""
}
)
ArticleDF$item_articletext[i] <- TempText
# print(paste0("Processing : ", i, " of ", nrow(ArticleDF), " ",
# ArticleDF$feed_title[i], " length : ", str_length(TempText)))
}
#
custom_stop_words = bind_rows(
tibble(
word = c("ap", "news", "week", "fox", "cnn", "cbs",
"bbc", "huffington", "huff", "post", "washington",
"colstrip")),
stop_words %>%
filter(lexicon != "NO SUCH LEXICON") %>%
dplyr::select(word)) %>%
mutate(word = SnowballC::wordStem(word))
ArticleTokenDF <- ArticleDF %>%
dplyr::select(feed_title, feed_type, item_link, item_title,
item_description, item_articletext) %>%
unnest_tokens(word, item_articletext) %>%
mutate(word = SnowballC::wordStem(word)) %>%
filter(!str_detect(word, "\\d|\\,")) %>%
anti_join(custom_stop_words, by = "word") %>%
group_by(item_link, word) %>%
summarise(count = n(), .groups = "drop")
ArticleDTM <- ArticleTokenDF %>%
cast_dtm(item_link, word, count)
TargetK <- min(round(MinCount * .8, 0), 10)
#ArticleLDA <- LDA(ArticleDTM, k = TargetK, control = list(seed = 1234))
ArticleLDA <- LDA(ArticleDTM, k = TargetK)
ArticleTopics_Word <- tidy(ArticleLDA, matrix = "beta")
ArticleTopTerms <- ArticleTopics_Word %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ArticleTopTerms_Name <- ArticleTopTerms %>%
arrange(topic, desc(beta)) %>%
group_by(topic) %>%
mutate(RowID = row_number(),
topic_beta = sum(beta)) %>%
ungroup() %>%
pivot_wider(id_cols = c(topic, topic_beta),
names_from = RowID, values_from = term) %>%
mutate(topic_name = paste(`1`, `2`, `3`, sep = " ")) %>%
dplyr::select(topic, topic_beta, topic_name)
# Re-categorize a topic as miscellaneous
#ArticleTopTerms_Name <- ArticleTopTerms_Name%>%
# mutate(topic_name = ifelse(topic_beta == min(topic_beta),
# "MISC", topic_name))
ArticleTopics_Doc <- tidy(ArticleLDA, matrix = "gamma") %>%
group_by(document) %>%
mutate(Maxgamma = max(gamma)) %>%
ungroup() %>%
filter(gamma == Maxgamma) %>%
dplyr::select(document, topic) %>%
left_join(ArticleTopTerms_Name, by = c("topic"))
We start by pulling the RSS feeds for the six news sources. Then we follow the article links and clean up the article text to prepare it for further analysis.
One challenge is that not all six sources publish an equal number of articles. Therefore we determine the source with the fewest articles and randomly select that same number of articles from each of the other sources. Some articles may contain no significant text (only video or external links), so those empty articles are ignored in the subsequent analysis.
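As an aside, the four normalization steps above could be collapsed into a single grouped sample with dplyr's slice_sample. This is only a sketch of an equivalent approach (it uses ArticleDF and MinCount as defined above and is not the code that produced the results below):
# Sketch only: randomly keep MinCount articles per source in one step
ArticleSampleDF <- ArticleDF %>%
group_by(feed_title) %>%
slice_sample(n = MinCount) %>%
ungroup()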
# Article Summary
TimeStamp = Sys.time()
attr(TimeStamp, "tzone") <- "US/Eastern"
tibble(
Description = c("Total articles",
paste0("Source with least articles : ",
filter(SelectorDF, feed_count == min(feed_count))$feed_title)[1],
"Revised total articles after random selection",
"Empty Articles",
"Total word count (net of stop words)",
"Unique word count"),
Value = c(sum(SelectorDF$feed_count),
min(SelectorDF$feed_count),
nrow(ArticleDF),
nrow(ArticleDF %>%
filter(str_length(item_articletext) <= 10)),
nrow(ArticleTokenDF),
length(unique(ArticleTokenDF$word)))
) %>%
kable(caption = paste0('<p style="color:black; font-size:18px"> Article Summary - ',
format(TimeStamp, "%m/%d/%Y %H:%M"),
'</p>')) %>%
kable_styling("striped") %>%
row_spec(0, color = "white", background = "black")
| Description | Value |
|---|---|
| Total articles | 129 |
| Source with least articles : USA | 9 |
| Revised total articles after random selection | 54 |
| Empty Articles | 0 |
| Total word count (net of stop words) | 9178 |
| Unique word count | 3724 |
Topics and the news change daily but which topics are reported more, less or not at all by each side?
Trying to determine topics using static methods is difficult. Here are a few methods and their challenges.
Keyword Topics : The news changes daily and a static list of keywords will never keep up with what might be hot news. Imagine that there is a hamster-worshiping cult in Laramie, Wyoming that suddenly makes the news for something tragic or wonderful. Chances are that neither "Laramie" nor "hamster" would be in your list of hot keywords.
Source-Labeled Topics : First, the sources may label the same topic in slightly different ways, making it difficult to tie topics together across sources. Second, we want to remove any meta-bias in how a source might label its own stories. From our previous example, perhaps one source's "cult" is another source's "charity".
Therefore we will be using a machine learning algorithm known as Latent Dirichlet Allocation or LDA to determine the topics.
LDA is a popular method for sorting documents into topics. It treats each document as a mixture of topics, and each topic as a mixture of words. This allows documents to "overlap" each other in terms of content, rather than being separated into discrete groups, in a way that mirrors typical use of natural language. For our purposes we will only be classifying each article to its most probable topic.
If you want to understand LDA in more depth, there is the Wikipedia entry as well as an easier-to-read Medium article with pictures.
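To make the "mixture of topics" idea concrete, you can inspect the fitted model's per-document topic probabilities (gamma) for a single article. This is just an illustrative sketch using the ArticleLDA object fitted above; the output changes with every run:
# Sketch: per-topic probabilities (gamma) for the first article in the model
tidy(ArticleLDA, matrix = "gamma") %>%
filter(document == first(document)) %>%
arrange(desc(gamma))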
ArticleTopTerms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered() +
theme_classic() +
labs(title = "Article Topic Model Groups",
subtitle = "By Most Probable Terms within Topic",
x = "Probability", y = "Term")
Because the topics are mathematically generated they have no real name. The number of topics is a parameter we can define for LDA and I currently have it set to 80% of the per-source article count after random selection, capped at 10. If there were 6 sources of 10 articles each, LDA will split that into 8 topics for us.
For labeling purposes we will be identifying each topic by its three most probable keywords. In late 2020, you might have ended up with something like "trump election biden" as a topic. That doesn't mean that every article in that group contains even one of those three words, or even sounds like it could be related to them. Typically the combined probability of the top 5 words in a given topic is less than 0.10, meaning that well over 90% of the words in those articles are something other than the top terms.
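You can check that claim directly from the model output. Here is a quick look at the probability mass carried by each topic's top 5 terms, using the ArticleTopTerms data frame built above:
# How much of each topic's word probability do the top 5 terms carry?
ArticleTopTerms %>%
group_by(topic) %>%
summarise(top5_beta = sum(beta), .groups = "drop")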
Also note that all words have been "stemmed" to reduce related forms to a common root. Words like "placing", "placed" and "places" will be stemmed to "place". This is great for text analytics but sometimes creates odd results like "suprem" instead of "supreme".
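For a quick feel for what the stemmer does, you can call SnowballC::wordStem directly (the same function used in the tokenizing chunk above):
# Illustration of Porter stemming on the examples mentioned above
SnowballC::wordStem(c("placing", "placed", "places", "supreme"))
# Should return: "place" "place" "place" "suprem"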
# Add the article topics to the main DF
ArticleDF <- ArticleDF %>%
left_join(ArticleTopics_Doc, by = c("item_link" = "document")) %>%
mutate(item_length = str_length(item_articletext),
item_empty = ifelse(item_length <= 10, "Y", "N"))
Now that the topics have been labeled, it's time to see which source types are reporting or not reporting each topic.
ArticleDF %>%
filter(str_length(item_articletext) > 10) %>%
group_by(feed_type, topic, topic_name) %>%
summarise(topic_count = n(), .groups = "drop") %>%
ggplot(aes(feed_type, topic_name, fill = topic_count)) +
geom_tile() +
# geom_label(aes(feed_type, topic_name, label = topic_count), fill = "white") +
scale_fill_gradient(low = "light grey", high = "dark red") +
theme_classic() +
labs(title = "Article Count by Topic and Source Type",
x = "Type", y = "Topic", fill = "Topic Count")
Let's break that down further by looking at the topics reported by source.
ArticleDF %>%
filter(str_length(item_articletext) > 10) %>%
mutate(feed_title = factor(feed_title, levels = c("HUF", "CBS",
"BBC", "USA",
"FOX", "WTM"))) %>%
group_by(feed_title, topic, topic_name) %>%
summarise(topic_count = n(), .groups = "drop") %>%
ggplot(aes(feed_title, topic_name, fill = topic_count)) +
geom_tile() +
scale_fill_gradient(low = "light grey", high = "dark red") +
theme_classic() +
labs(title = "Article Count by Topic and Source Name",
x = "Type", y = "Topic", fill = "Topic Count")
To examine this in more detail, here are lists of article titles and links for any topics that one side or the other did not report.
ArticleTopic_Counts <- ArticleDF %>%
group_by(feed_type, topic_name) %>%
summarise(topic_count = n(), .groups = "drop") %>%
filter(!is.na(topic_name) & topic_name != "NA") %>%
pivot_wider(names_from = feed_type, values_from = topic_count) %>%
rename(TopicRptLeft = Left, TopicRptCenter = Center, TopicRptRight = Right)
ArticleDF <- ArticleDF %>%
mutate(item_display_link = cell_spec(item_title, "html",
link = item_link)) %>%
left_join(ArticleTopic_Counts, by = c("topic_name"))
# Filter to articles not reported by right
if (nrow(ArticleDF %>% filter(is.na(TopicRptRight))) < 1) {
cat("There are no articles with topics not reported by the right")
} else {
ArticleDF %>%
filter(is.na(TopicRptRight)) %>%
dplyr::select(feed_title, feed_type, item_display_link, topic_name) %>%
rename(Feed = feed_title,
Bias = feed_type,
Link = item_display_link,
Topic = topic_name) %>%
kable("html", escape = FALSE,
caption = '<p style="color:blue; font-size:18px">Topics Not Reported by Right</p>') %>%
kable_styling("striped",
bootstrap_options = c("hover", "condensed")) %>%
row_spec(0, color = "white", background = "black") %>%
scroll_box(height = "500px")
}
# Filter to articles not reported by left
if (nrow(ArticleDF %>% filter(is.na(TopicRptLeft))) < 1) {
cat("There are no articles with topics not reported by the left")
} else {
ArticleDF %>%
filter(is.na(TopicRptLeft)) %>%
dplyr::select(feed_title, feed_type, item_display_link, topic_name) %>%
rename(Feed = feed_title,
Bias = feed_type,
Link = item_display_link,
Topic = topic_name) %>%
kable("html", escape = FALSE,
caption = '<p style="color:red; font-size:18px">Topics Not Reported by Left</p>') %>%
kable_styling("striped",
bootstrap_options = c("hover", "condensed")) %>%
row_spec(0, color = "white", background = "black") %>%
scroll_box(height = "500px")
}
| Feed | Bias | Link | Topic |
|---|---|---|---|
| BBC | Center | 'I built a career from true crime and make-up' | polic video offic |
| BBC | Center | Thomas Lane: Ex-policeman jailed for role in George Floyd killing | polic video offic |
| FOX | Right | DC hotel shooting leaves one person injured after 'barricade' situation: Police | polic video offic |
| FOX | Right | North Carolina police officers shoot suspect while responding to kidnapping call, caught on doorbell video | polic video offic |
| FOX | Right | California mom on bus platform assaulted by man trying to take her toddler | polic video offic |
| USA | Center | 'One date ... and now I'm being sued': Woman sues man for $10K for standing her up on a date | polic video offic |
| USA | Center | Great white shark washes up on Long Island beach in New York, police say | polic video offic |
# Before we spend $ and time getting sentiments, we will see if these have been previously reported. If so we will flag them so that they can be filtered from the API calls
OldArticleDF <- readRDS(file = "NewsMachine_ArticleDF.RDS") %>%
dplyr::select(item_link, sentiment, positive, neutral, negative) %>%
mutate(reported = "Y")
ArticleDF <- ArticleDF %>%
left_join(OldArticleDF %>%
dplyr::select(item_link, reported), by = "item_link")
## GET ARTICLE SENTIMENTS
# We will use the Microsoft cognitive services API
tak <- rawToChar(readRDS(paste0("/home/woolyadmin/R/tak.rds"))) # My Microsoft API key
#MSURL31 <- "https://eastus.api.cognitive.microsoft.com/text/analytics/v3.1-preview.3/sentiment?showStats=TRUE"
MSURL31 <- "https://eastus.api.cognitive.microsoft.com/text/analytics/v3.1/sentiment?showStats=TRUE"
WipDF <- ArticleDF %>%
filter(str_length(item_articletext) > 10 &
is.na(reported))
if (nrow(WipDF) > 0) {
for (i in 1:nrow(WipDF)) {
# print(paste0(i, " ", WipDF$item_title[i]))
DocDF <- tibble(
language = "en",
id = WipDF$item_link[i],
text = WipDF$item_articletext[i]) %>%
mutate(text = str_sub(text, 1, 5001))
DocJSON <- DocDF
DocJSON <- list(documents = DocJSON) %>% toJSON()
SentimentResult <- POST(
url = MSURL31,
body = DocJSON,
config=add_headers(
c('Content-Type'='application/json',
'Ocp-Apim-Subscription-Key' = tak)))
TempArticleDF <- fromJSON(
content(SentimentResult, as="text"))$documents %>%
as_tibble()
TempSentenceDF <-
fromJSON(
content(SentimentResult, as="text"))$documents$sentences[[1]] %>%
as_tibble() %>%
mutate(id = DocDF$id[1])
if (i == 1) {
WipArticleSentimentDF <- TempArticleDF
WipSentenceSentimentDF <- TempSentenceDF
} else {
WipArticleSentimentDF <- bind_rows(WipArticleSentimentDF,
TempArticleDF)
WipSentenceSentimentDF <- bind_rows(WipSentenceSentimentDF,
TempSentenceDF)
}
}
# Resolve the embedded data frames in the Article Sentiment DF
WipArticleSentimentDF <- bind_cols(WipArticleSentimentDF %>%
dplyr::select(id, sentiment),
WipArticleSentimentDF$confidenceScores)
WipSentenceSentimentDF <- bind_cols(WipSentenceSentimentDF %>%
dplyr::select(id, sentiment, offset,
length, text),
WipSentenceSentimentDF$confidenceScores)
# Add the article sentiments to the main article data frame
WipDF <- WipDF %>%
filter(is.na(reported)) %>%
left_join(WipArticleSentimentDF,
by = c("item_link" = "id"))
}
ArticleDF <- ArticleDF %>%
left_join(OldArticleDF %>%
dplyr::select(-reported),
by = c("item_link"))
# Add new articles if present, replacing their un-scored copies so they are not duplicated
if (nrow(WipDF) > 0) {
ArticleDF <- ArticleDF %>%
anti_join(WipDF %>% dplyr::select(item_link), by = "item_link") %>%
bind_rows(WipDF)
}
ArticleDF <- ArticleDF %>%
mutate(sentiment = factor(sentiment,
levels = c("positive", "mixed",
"neutral", "negative", "")))
Is each side reporting certain topics in a more positive or more negative tone than the other side?
There are many packages and methods for getting word sentiment. However, scoring words in isolation is too simplistic and can lead to the wrong conclusions.
The best sentiment engines use word sequence and context to generate more reliable results. Since we do not have the time or the will to develop that independently, we are fortunate that there are commercial options from Amazon, Google, IBM, Microsoft and others. For our purposes, we are going to use Microsoft's Cognitive Services Text Analytics API to get sentiment. There is a cost for heavy usage, but for this project I think we can stay within the 5K-per-month free tier.
The API delivers results at the sentence level and then aggregates that to the document (article) level. Here is a sampling at the document level.
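When at least one new article has been scored in the current run, the sentence-level detail behind a document score can be inspected as well. This is a small sketch using the WipSentenceSentimentDF built above (it will not exist on runs with no new articles):
# Sketch: sentence-level sentiment behind one document-level score
WipSentenceSentimentDF %>%
filter(id == first(id)) %>%
dplyr::select(sentiment, positive, neutral, negative, text) %>%
head(5)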
ArticleDF[1:3,] %>%
dplyr::select(item_title, sentiment, positive, neutral, negative) %>%
rename(Title = item_title, Sentiment = sentiment, Positive = positive,
Neutral = neutral, Negative = negative) %>%
kable("html", escape = FALSE,
caption = '<p style="color:black; font-size:18px">Sentiment Results Example</p>') %>%
kable_styling("striped",
bootstrap_options = c("hover", "condensed")) %>%
row_spec(0, color = "white", background = "black")
| Title | Sentiment | Positive | Neutral | Negative |
|---|---|---|---|---|
| Dave Chappelle show cancelled over transgender jokes controversy | NA | NA | NA | NA |
| Dave Chappelle show cancelled over transgender jokes controversy | negative | 0.01 | 0.01 | 0.98 |
| Dave Chappelle show cancelled over transgender jokes controversy | NA | NA | NA | NA |
Now let's explore that sentiment by topic and source type. It's interesting, though not surprising, how little of the overall news sentiment is positive. It reinforces the axiom that "Bad News Sells".
ArticleDF %>%
filter(!is.na(sentiment)) %>%
group_by(feed_type, topic_name, sentiment) %>%
summarise(sentiment_count = n(), .groups = "drop") %>%
ggplot(aes(sentiment_count, topic_name, fill = sentiment)) +
geom_col() +
facet_wrap(~ feed_type, ncol = 3) +
scale_fill_manual(breaks = c("positive", "neutral", "mixed", "negative"),
values = c("light green", "light blue", "light grey", "black")) +
theme_classic() +
labs(title = "Article Sentiment Count by Topic and Source Type",
x = "Topic Count", y = "Topic", fill = "Sentiment")
A breakdown by source reveals who is the most negative overall.
ArticleDF %>%
mutate(feed_title = factor(feed_title, levels = c("HUF", "CBS",
"BBC", "USA",
"FOX", "WTM"))) %>%
group_by(feed_title, topic_name, sentiment) %>%
summarise(sentiment_count = n(), .groups = "drop") %>%
ggplot(aes(sentiment_count, topic_name, fill = sentiment)) +
geom_col() +
facet_wrap(~ feed_title, ncol = 6) +
scale_fill_manual(breaks = c("positive", "neutral", "mixed", "negative"),
values = c("light green", "light blue", "light grey", "black")) +
theme_classic() +
labs(title = "Article Sentiment Count by Topic and Source Name",
x = "Topic Count", y = "Topic", fill = "Sentiment")
Are articles that mention certain keywords reported more, less or not at all by each side, and what is the sentiment of the articles containing those keywords?
We are starting this list small and will eventually grow it based on the results from the LDA analytics. For now the keywords are broken down as follows:
Left Word Group : Last names of key political figures and terms associated with the left.
Right Word Group : Last names of key political figures and terms associated with the right.
Topic Group : Independent topics that could align with either group.
The word groups will be evaluated as a whole while the topics will be evaluated individually.
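One wrinkle to keep in mind: the article tokens were stemmed earlier, so the keywords have to be stemmed at match time to line up with those tokens (see the keyword-matching chunk below). A quick illustrative check of what a few of the keywords reduce to:
# Illustration: several keywords after Porter stemming (to match the stemmed tokens)
SnowballC::wordStem(c("conservative", "progressive", "vaccine", "immigration"))
# Should return: "conserv" "progress" "vaccin" "immigr"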
Keywords <- tibble(
word = c("trump", "pence", "mcconnell", "mccarthy", "republican",
"conservative"),
type = rep("Right Word Group", 6)) %>%
bind_rows(tibble(
word = c("biden", "harris", "pelosi", "schumer", "democrat",
"liberal", "progressive"),
type = rep("Left Word Group", 7))) %>%
bind_rows(tibble(
word = c("christian", "muslim", "immigration", "riot", "defund",
"protest", "lgbtq", "vaccine", "covid"),
type = rep("Topic Group", 9)))
Keywords %>%
rename(Word = word, Type = type) %>%
kable("html", escape = FALSE,
caption = '<p style="color:black; font-size:18px">Keyword Topics and Groupings</p>') %>%
kable_styling("striped",
bootstrap_options = c("hover", "condensed")) %>%
row_spec(0, color = "white", background = "black") %>%
scroll_box(height = "500px")
| Word | Type |
|---|---|
| trump | Right Word Group |
| pence | Right Word Group |
| mcconnell | Right Word Group |
| mccarthy | Right Word Group |
| republican | Right Word Group |
| conservative | Right Word Group |
| biden | Left Word Group |
| harris | Left Word Group |
| pelosi | Left Word Group |
| schumer | Left Word Group |
| democrat | Left Word Group |
| liberal | Left Word Group |
| progressive | Left Word Group |
| christian | Topic Group |
| muslim | Topic Group |
| immigration | Topic Group |
| riot | Topic Group |
| defund | Topic Group |
| protest | Topic Group |
| lgbtq | Topic Group |
| vaccine | Topic Group |
| covid | Topic Group |
If an article has a keyword in it 3 or more times then it is included in the keyword analysis. First, a view by source type.
KeywordSentimentDF <- ArticleTokenDF %>%
# The article tokens were stemmed above, so stem the keywords before joining
left_join(Keywords %>%
mutate(word = SnowballC::wordStem(word)),
by = c("word")) %>%
filter(count >= 3 & !is.na(type)) %>%
mutate(KeyWord = case_when(
type != "Topic Group" ~ type,
TRUE ~ word)) %>%
group_by(item_link, KeyWord) %>%
summarise(n = n(), .groups = c("drop")) %>%
left_join(ArticleDF %>%
dplyr::select(item_link, feed_type, feed_title, sentiment),
by = c("item_link"))
KeywordSentimentDF %>%
ggplot(aes(n, KeyWord, fill = sentiment)) +
geom_col() +
facet_wrap(~ feed_type, ncol = 3) +
scale_fill_manual(breaks = c("positive", "neutral", "mixed", "negative"),
values = c("light green", "light blue", "light grey", "black")) +
theme_classic() +
labs(title = "Article Sentiment Count by Select Keyword Groups and Source Type",
x = "Topic Count", y = "Topic", fill = "Sentiment")
Now a deeper look by source name.
XMax <- KeywordSentimentDF %>%
group_by(feed_title) %>%
summarise(n = n(), .groups = "drop")
XMax <- max(XMax$n)
XScale <- round(XMax / 2, 0)
KeywordSentimentDF %>%
mutate(feed_title = factor(feed_title, levels = c("HUF", "CBS",
"BBC", "USA",
"FOX", "WTM"))) %>%
ggplot(aes(n, KeyWord, fill = sentiment)) +
geom_col() +
facet_wrap(~ feed_title, ncol = 6) +
scale_x_continuous(breaks = seq(0, XMax, by = XScale)) +
scale_fill_manual(breaks = c("positive", "neutral", "mixed", "negative"),
values = c("light green", "light blue", "light grey", "black")) +
theme_classic() +
labs(title = "Article Sentiment Count by Select Keyword Groups and Source Name",
x = "Topic Count", y = "Topic", fill = "Sentiment")
Now that you have seen some objective measures of potential bias, you can consume the news by...
Being savvy
ArticleDF %>%
mutate(UnreportedBy = case_when(
is.na(TopicRptRight) & !is.na(TopicRptLeft) ~ "Right",
is.na(TopicRptLeft) & !is.na(TopicRptRight) ~ "Left",
is.na(TopicRptCenter) & (!is.na(TopicRptRight) | !is.na(TopicRptLeft)) ~ "Center",
TRUE ~ ""
)) %>%
rename(Feed = feed_title,
Bias = feed_type,
Link = item_display_link,
Topic = topic_name,
Sentiment =sentiment) %>%
dplyr::select(Topic, Bias, Feed, Link, Sentiment, UnreportedBy) %>%
arrange(Topic, Bias, Feed, Link, Sentiment) %>%
kable("html", escape = FALSE,
caption = paste0('<p style="color:black; font-size:18px">',
'Appendix - All Articles',
'</p>')) %>%
kable_styling("striped",
bootstrap_options = c("hover", "condensed")) %>%
row_spec(0, color = "white", background = "black") %>%
scroll_box(height = "500px")
Note : I have plans to evaluate the news over time now that I have this process running and collecting data nearly every day. If you have some ideas for exploring it further, please leave a comment.
TestDataset01 <- dbConnect(
bigrquery::bigquery(),
project = "tokyo-portal-227113",
dataset = "TestDataset01",
billing = "tokyo-portal-227113")
# savvy-analytics-42.demonstration
Demonstration <- dbConnect(
bigrquery::bigquery(),
project = "savvy-analytics-42",
dataset = "demonstration",
billing = "savvy-analytics-42")
# If there are no new articles then no need to output
if (nrow(WipDF) > 0) {
TodaysArticleDF = ArticleDF %>%
filter(is.na(reported)) %>%
dplyr::select(-reported)
PriorArticleDF = readRDS("NewsMachine_ArticleDF.RDS")
# Append the current article DF (exclude previously reported)
# to the historical article DF
ALLArticleDF <- bind_rows(TodaysArticleDF, PriorArticleDF)
# Save the historical articles
saveRDS(ALLArticleDF, file = "NewsMachine_ArticleDF.RDS")
# Save the historical articles to BigQuery
bq_auth(path = "BQAPIKey.json")
dbWriteTable(TestDataset01, "NewsMachine_Article", TodaysArticleDF, append = TRUE)
bq_auth(path = "BQSavvyDemonstration.json")
dbWriteTable(Demonstration, "NewsMachine_Article", TodaysArticleDF, append = TRUE)
# Append the current sentence sentiment to the RDS file
ALLSentenceSentimentDF <- bind_rows(WipSentenceSentimentDF,
readRDS("NewsMachine_SentenceSentimentDF.RDS"))
# Save the historical sentence sentiments
saveRDS(ALLSentenceSentimentDF, file = "NewsMachine_SentenceSentimentDF.RDS")
}