Conducting a sentiment analysis of newpaper articles

The packages I loaded for this week

rm(list=ls()) # I like to use this to clear my global environment

library(dplyr)
library(tidytext)
library(tidyr)
library(janitor)
library(lubridate)
library(textdata)
library(ggplot2)
library(tm)
library(stringr)
library(scales)
library(topicmodels)


setwd('/Users/aakashupraity/Desktop/')

Loading my dataset: It’s a bit untidy (just a bit)

import <- read.csv('/Users/aakashupraity/Desktop/owsdata.csv', header=TRUE, stringsAsFactors = FALSE)
str(import) #Our initial dataset is a long and wide spreadsheet of dates, text, and numbers, with ill-advised names
## 'data.frame':    2434 obs. of  17 variables:
##  $ PUB..LOCATION                     : chr  "Dallas" "Dallas" "Portland" "Portland" ...
##  $ NEWSPAPER                         : chr  "Polk County Itemizer-Observer" "Polk County Itemizer-Observer" "Willamette Weekly" "Willamette Weekly" ...
##  $ TITLE                             : chr  "Water quality focus of advisory council" "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer" "Something in the Water" "\"Witchcraft\" For Bureaucrats" ...
##  $ AUTHOR..Last.Name..First.Initial..: chr  NA NA "Budnick, A. N." "Brosy, A." ...
##  $ DATE.PUB...MM.DD.YYYY.            : chr  "5/2/01" "4/7/04" "3/29/05" "8/15/06" ...
##  $ QUARTER.PUBLISHED                 : int  2 2 1 3 4 4 3 3 4 1 ...
##  $ DATE.ACCESSED..MM.DD.YYYY.        : chr  "1/29/19" "1/29/19" "1/30/18" "2/13/18" ...
##  $ VOLUME                            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ ISSUE.NUMBER                      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ PAGE                              : chr  "" "" "" "" ...
##  $ LINK                              : chr  "http://www.polkio.com/news/2001/may/02/water-quality-focus-of-advisory-council/" "http://www.polkio.com/news/2004/apr/07/just-what-is-the-luckiamute-watershed-council/" "http://www.wweek.com/portland/article-4261-something-in-the-water.html" "http://www.wweek.com/portland/article-5953-witchcraft-for-bureaucrats.html" ...
##  $ CITATION                          : chr  "Water quality focus of advisory council. (2001, May 2).Polk County Itemizer-Observer. Retrieved from http://www.polkio.com/news "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer. (2004, April 7).Polk County Itemizer-Observer. Re "Bundick, A. N. (2005, March 29). Something in the Water. Willamette Week. Retrieved from http://www.wweek.com/portland/article- "Brosy, A. (2006, August 15). \"Witchcraft\" For Bureaucrats. Willamette Weekly. Retrieved from http://www.wweek.com/portland/ar ...
##  $ DATA.ENTRY.                       : chr  "X" "X" "X" "X" ...
##  $ FILE.NAME                         : chr  "DA012PC122" "DA042PC123" "PO051WW043" "PO063WW028" ...
##  $ ORIGINAL.FILE.NAME                : chr  "not scraped" "not scraped" "not scraped" "not scraped" ...
##  $ CLEAN.TXT.                        : chr  "X" "X" "" "" ...
##  $ FULL.CLEANED.TEXT                 : chr  "Water quality and availability has been in the headlines a lot lately, but as most rural residents know, water is always a top  "The Luckiamute Watershed Council works to improve water quality for humans, fish and wildlife.The Luckiamute Watershed Council  "Portlanders have voted down fluoridation three times, but now the state Legislature may force it down our throats-literally.A b "Dick Torpey squints at the hot summer sky and slowly walks across a parking lot with two thin, yard-long rods held lightly in f ...

I changed some names to simplify my analyses…

colnames(import)[colnames(import)=="DATE.PUB...MM.DD.YYYY."] <-"when" #Date article was published
colnames(import)[colnames(import)=="NEWSPAPER"] <-"paper" #Name of publication
colnames(import)[colnames(import)=="TITLE"] <-"article" #Title of article
colnames(import)[colnames(import)=="CITATION"] <-"cit" #Citation
colnames(import)[colnames(import)=="PUB..LOCATION"] <-"place" #Location of publishing house
colnames(import)[colnames(import)=="FULL.CLEANED.TEXT"] <-"edited" #Edited article text
colnames(import)[colnames(import)=="FILE.NAME"] <-"code" #Article identifier
import <- janitor::remove_empty(import, which = "cols") #removes empty columns
import$when <- lubridate:: mdy(import$when) #categorize data as formatted date

…reduced the size of the dataset while still retaining all important information

data <- import %>% 
  dplyr::select(when, place, paper, article, cit, code, edited) %>% #creating a dataset with only the information I'm interested in 
  mutate(linenumber=row_number(edited)) %>% #creating an index to better keep track of variables
  group_by(article) # for now

And, with some more wrangling…

ta <- data %>%
  group_by(article, place, when) %>% 
  unnest_tokens(word,edited) # %>% breaking up my edited articles by words
  #anti_join(stop_words)%>% this command removes commonly used, low sentiment words; I won't use it now, but will have to in the future

I then selected for dates after 2016 and now have a dataset that I can use for a sentiment analysis

tidyarticles <- ta%>% 
  filter(when > "2015-12-31")
head(tidyarticles)
## # A tibble: 6 x 4
## # Groups:   article, place, when [1]
##   article                                  place      when       word        
##   <chr>                                    <chr>      <date>     <chr>       
## 1 State snowpack rises above normal levels Clatskanie 2016-01-08 cold        
## 2 State snowpack rises above normal levels Clatskanie 2016-01-08 mountain    
## 3 State snowpack rises above normal levels Clatskanie 2016-01-08 temperatures
## 4 State snowpack rises above normal levels Clatskanie 2016-01-08 and         
## 5 State snowpack rises above normal levels Clatskanie 2016-01-08 copious     
## 6 State snowpack rises above normal levels Clatskanie 2016-01-08 amounts

I want to see a breakdown of articles by location

tidyarticles %>%
  group_by(place) %>% 
  distinct(article) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  print.data.frame()
##            place   n
## 1          Salem 251
## 2       Portland 245
## 3        Newport 184
## 4  Klamath Falls 176
## 5           Bend 129
## 6       Coos Bay 128
## 7      Pendleton 122
## 8      Tillamook 102
## 9       Roseburg 100
## 10       Ontario  97
## 11         Burns  68
## 12    Hood River  58
## 13  Warm Springs  56
## 14    Baker City  39
## 15    The Dalles  34
## 16    Clatskanie  31
## 17        Dallas  28
## 18       Sisters  24
## 19     Brookings  20
## 20     La Grande  20
## 21       Medford  20
## 22          Vale  20
## 23       Astoria  10
## 24      Lakeview   9

A lexicon is an existing dictionary used for evaluating emotions or opinions

The Bing et. al Lexicon is one of the more basic available lexicons. It categorizes words purely as ‘negative’ or ‘positive values’ - but, I’m changing it a bit

bing <- tidyarticles %>% 
  inner_join(get_sentiments("bing")) %>% # I'm telling R to join a column of bing sentiment values to my dataset
  count(place, when, article, sentiment) %>% # creating a count column of my articles and their cumulative sentiments
  spread(sentiment, n, fill=0) %>%  # splitting that column based on the +/- sentiments...
  mutate(sentiment = positive - negative) %>% #...to analyze  them again
  rename(bingraw = sentiment) %>% # and now renaming
  dplyr::select(-positive, -negative) #and tidying

I now have a sentiment analysis of the text using the Bing lexicon: I have the cumulative positive and negative values for each article

head(bing)%>% 
  arrange(desc(when))
## # A tibble: 6 x 4
## # Groups:   article, place, when [6]
##   article                                              place  when       bingraw
##   <chr>                                                <chr>  <date>       <dbl>
## 1 "\"Ocean Shorts\" Film Series opens at VAC"          Newpo… 2019-08-08       5
## 2 "\"Watershed is saturated.' Irrigation officials sa… Ontar… 2019-05-30      -1
## 3 "\"Respect the Water\" Safety Committee meets in th… Roseb… 2019-05-23       3
## 4 " \xd4Polluted by Money\xd5 series underscores our … Portl… 2019-03-23      14
## 5 "__Gov. Brown wants to cancel water swap plans for … Salem  2017-10-30      -2
## 6 "(If) a pipeline runs through it"                    Coos … 2017-03-23       0

Doing the same with the Afinn lexicon; this lexicon is slightly different, as it provides words with a score from -5 to 5

afinn <- tidyarticles %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(place, when, article, value) %>% 
  summarise(afinnraw = sum(value)) %>% # Summarizing my article Afinn scores slightly differently here
  drop_na() %>% 
  summarise(afinnraw = sum(afinnraw))
head(afinn)
## # A tibble: 6 x 4
## # Groups:   place, when [6]
##   place   when       article                                            afinnraw
##   <chr>   <date>     <chr>                                                 <dbl>
## 1 Astoria 2016-06-21 Astoria city dam likely to survive quake                  2
## 2 Astoria 2016-06-22 Stormwater projects top of the list in Port of As…       19
## 3 Astoria 2017-12-21 Commercial Crabbing to Start in January                   0
## 4 Astoria 2018-01-09 Oregon transportation workers spray it safe on Cl…      -17
## 5 Astoria 2018-01-12 Salmon are losing their genetic diversity                12
## 6 Astoria 2018-01-15 Knappa Water Association flushing water mains            -1

And then I created a single dataset with both aggregated lexicon scores

duolex <- data.frame(inner_join(afinn, bing))
duosent <- duolex %>%
  gather(key= "sentiment", value = "scores", -c(place, when, article)) #alternatively, use pivot_longer to grab just 1 key-value pair
head(duosent)
##     place       when
## 1 Astoria 2016-06-21
## 2 Astoria 2016-06-22
## 3 Astoria 2017-12-21
## 4 Astoria 2018-01-09
## 5 Astoria 2018-01-12
## 6 Astoria 2018-01-15
##                                                                            article
## 1                                         Astoria city dam likely to survive quake
## 2                    Stormwater projects top of the list in Port of Astoria budget
## 3                                          Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5                                        Salmon are losing their genetic diversity
## 6                                    Knappa Water Association flushing water mains
##   sentiment scores
## 1  afinnraw      2
## 2  afinnraw     19
## 3  afinnraw      0
## 4  afinnraw    -17
## 5  afinnraw     12
## 6  afinnraw     -1

I decided to wrangle my data some more.

duosent$abscores <- abs(duosent$scores) # created a new column of absolute sentiment values
duosent$perc <- rescale(duosent$scores, to=c(0,100)) # creating a new column of normalized scores - converting the entire range of sentiment scores to a 0-100 scale 
duosent$overall <- ifelse(duosent$scores >0, "positive", "negative") #and yet another column of another variable
head(duosent)
##     place       when
## 1 Astoria 2016-06-21
## 2 Astoria 2016-06-22
## 3 Astoria 2017-12-21
## 4 Astoria 2018-01-09
## 5 Astoria 2018-01-12
## 6 Astoria 2018-01-15
##                                                                            article
## 1                                         Astoria city dam likely to survive quake
## 2                    Stormwater projects top of the list in Port of Astoria budget
## 3                                          Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5                                        Salmon are losing their genetic diversity
## 6                                    Knappa Water Association flushing water mains
##   sentiment scores abscores     perc  overall
## 1  afinnraw      2        2 37.91946 positive
## 2  afinnraw     19       19 43.62416 positive
## 3  afinnraw      0        0 37.24832 negative
## 4  afinnraw    -17       17 31.54362 negative
## 5  afinnraw     12       12 41.27517 positive
## 6  afinnraw     -1        1 36.91275 negative

And then got to plotting!

I wanted to plot some data to see if I could see anything…

plotduosent <- ggplot(duosent, aes(x=when, y=scores, color=overall)) +
  geom_jitter()+
  geom_smooth(color="black", linetype ="dashed") + 
  xlab("") #I'm hiding this axis on purpose!

plotduosent+
  theme_minimal()

trying a ribbon plot

plotduosent <- ggplot(duosent, aes(x=when, y=scores, color=overall)) +
  geom_jitter()+
  geom_smooth(color="black", linetype ="dashed") + 
  geom_ribbon(aes(ymin=-150, ymax=150), 
                alpha=0.1,       #transparency
                linetype=1,      #solid, dashed or other line types
                colour="grey70", #border line color
                size=1,          #border line size
                fill="green")    #fill color


plotduosent+
  theme_minimal()

whereplot <- ggplot(duosent, aes(x=place, y=scores))
whenplot <- ggplot(duosent, aes(x=when,y=scores))


whereplot+
 # geom_bar(stat='identity', aes(levels(factor(fill=duosent$overall))), position="dodge")+ # I like to write out the code of the tasks I'm trying to accomplish even if the syntax isn't correct
  geom_bar(stat = "identity", position="dodge", aes(y=abscores, fill=overall))+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 60))

This figure shows all positive and negative sentiment scores in all the newspapers from Oregon.

It seems like sentiment scores are pretty evenly positive and negative across most locations - Roseburg and Hood River (1 of them each!) are some of the obvious exceptions.

And now, I’m interested in sentiment patterns across all locations over time…

whenplot+
  geom_bar(stat = "identity", position="stack", aes(y=scores, fill=overall))+ #I'm constructing this even though it will barely be visible
  geom_smooth(method="loess")+ #chose the loess method for smoothing because of the presence of outliers
  guides(fill=FALSE)+ #removed the legend
  scale_x_date(date_labels = "%y")+
  theme_minimal()+
  theme_linedraw()+
  facet_wrap(~place, ncol = 7)

Resulting in:

Lets try topic modelling!

ta_td <- tidyarticles %>%
  anti_join(stop_words, by = "word") %>%
  count(article, word) %>% 
  cast_dtm(article, word, n)


ta_lda <- LDA(ta_td, k=6)
ta_lda
## A LDA_VEM topic model with 6 topics.
ta_topics <- tidy(ta_lda, matrix="beta")
ta_topics
## # A tibble: 276,888 x 3
##    topic term           beta
##    <int> <chr>         <dbl>
##  1     1 1       0.000471   
##  2     2 1       0.00121    
##  3     3 1       0.00143    
##  4     4 1       0.00136    
##  5     5 1       0.000629   
##  6     6 1       0.00112    
##  7     1 100,000 0.000108   
##  8     2 100,000 0.00000575 
##  9     3 100,000 0.000142   
## 10     4 100,000 0.000000146
## # … with 276,878 more rows
ta_top <- ta_topics %>% 
  group_by(topic) %>% 
  top_n(10,beta) %>% 
  ungroup() %>% 
  arrange(topic, -beta)

ta_top %>% 
  mutate(term=reorder(term, beta)) %>% 
  ggplot(aes(term, beta, fill=factor(topic)))+
  geom_col(show.legend=FALSE)+
  facet_wrap(~topic,scales="free")+
  coord_flip()

And that’s it!

Next steps

  • Randomly sample articles (after weightage) to select articles for topic modeling, use modeled topics for key word searches.
  • Create a customizable word search interface for key word searches in a sentiment analysis.
  • Acquire demographic and election data at similar geographic scale; implement a GIS.
  • Corroborate key word searches with sentiment, with additional lexicons.
  • Word clouds, tf-idf