Conducting a sentiment analysis of newpaper articles

  • Describe data, etc
  • This is the primary dataset we are analyzing. It consists of 2000+ articles from 19 newspaper anthologies across Oregon, dated mostly between 2014 and 2020. The articles were gathered by students in Dr. Melissa Haeffner’s Freshman Oregon Water Stories class at Portland State University.

Importing data

rm(list=ls()) # I like to use this to clear my global environment

import <- read.csv('/Users/aakashupraity/Desktop/owsfinal2021/rawdata.csv', header=TRUE, stringsAsFactors = FALSE)

Cleaning the data

library(dplyr)
library(lubridate)
colnames(import)[colnames(import)=="DATE.PUB...MM.DD.YYYY."] <-"when" #Date article was published
colnames(import)[colnames(import)=="NEWSPAPER"] <-"paper" #Name of publication
colnames(import)[colnames(import)=="TITLE"] <-"article" #Title of article
colnames(import)[colnames(import)=="CITATION"] <-"cit" #Citation
colnames(import)[colnames(import)=="PUB..LOCATION"] <-"place" #Location of publishing house
colnames(import)[colnames(import)=="FULL.CLEANED.TEXT"] <-"edited" #Edited article text
colnames(import)[colnames(import)=="FILE.NAME"] <-"code" #Article identifier
import <- janitor::remove_empty(import, which = "cols") #removes empty columns
import$when <- lubridate:: mdy(import$when) #categorize data as formatted date


data <- import %>% 
  dplyr::select(when, place, paper, article, cit, code, edited) %>% 
  group_by(article)

str(data)
## tibble [2,431 × 7] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ when   : Date[1:2431], format: "2001-05-02" "2004-04-07" ...
##  $ place  : chr [1:2431] "Dallas" "Dallas" "Portland" "Portland" ...
##  $ paper  : chr [1:2431] "Polk County Itemizer-Observer" "Polk County Itemizer-Observer" "Willamette Weekly" "Willamette Weekly" ...
##  $ article: chr [1:2431] "Water quality focus of advisory council" "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer" "Something in the Water" "\"Witchcraft\" For Bureaucrats" ...
##  $ cit    : chr [1:2431] "Water quality focus of advisory council. (2001, May 2).Polk County Itemizer-Observer. Retrieved from http://www"| __truncated__ "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer. (2004, April 7).Polk County Itemi"| __truncated__ "Bundick, A. N. (2005, March 29). Something in the Water. Willamette Week. Retrieved from http://www.wweek.com/p"| __truncated__ "Brosy, A. (2006, August 15). \"Witchcraft\" For Bureaucrats. Willamette Weekly. Retrieved from http://www.wweek"| __truncated__ ...
##  $ code   : chr [1:2431] "DA012PC122" "DA042PC123" "PO051WW043" "PO063WW028" ...
##  $ edited : chr [1:2431] "Water quality and availability has been in the headlines a lot lately, but as most rural residents know, water "| __truncated__ "The Luckiamute Watershed Council works to improve water quality for humans, fish and wildlife.The Luckiamute Wa"| __truncated__ "Portlanders have voted down fluoridation three times, but now the state Legislature may force it down our throa"| __truncated__ "Dick Torpey squints at the hot summer sky and slowly walks across a parking lot with two thin, yard-long rods h"| __truncated__ ...
##  - attr(*, "groups")= tibble [2,408 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ article: chr [1:2408] " ‘Polluted by Money’ series underscores our commitment to journalism in the public interest (column)" " Portland restaurants scramble after Friday's boil water notice." "‘Affordability concerns’: Costly arsenic solution will trickle down to city’s residents" "‘Carbon pollution isn’t free’: How Oregon can cap, trade, reinvest" ...
##   ..$ .rows  : list<int> [1:2408] 
##   .. ..$ : int 1946
##   .. ..$ : int 113
##   .. ..$ : int 1684
##   .. ..$ : int 804
##   .. ..$ : int 755
##   .. ..$ : int 2050
##   .. ..$ : int 664
##   .. ..$ : int 1457
##   .. ..$ : int 306
##   .. ..$ : int 1780
##   .. ..$ : int 41
##   .. ..$ : int 2157
##   .. ..$ : int 2055
##   .. ..$ : int 2065
##   .. ..$ : int 4
##   .. ..$ : int 819
##   .. ..$ : int 1343
##   .. ..$ : int 204
##   .. ..$ : int 611
##   .. ..$ : int 133
##   .. ..$ : int 651
##   .. ..$ : int 636
##   .. ..$ : int 352
##   .. ..$ : int 403
##   .. ..$ : int 181
##   .. ..$ : int 610
##   .. ..$ : int 1065
##   .. ..$ : int 224
##   .. ..$ : int 2409
##   .. ..$ : int 726
##   .. ..$ : int 1034
##   .. ..$ : int 1851
##   .. ..$ : int 922
##   .. ..$ : int 706
##   .. ..$ : int 893
##   .. ..$ : int 1507
##   .. ..$ : int 1030
##   .. ..$ : int 465
##   .. ..$ : int 1654
##   .. ..$ : int 149
##   .. ..$ : int 1696
##   .. ..$ : int 483
##   .. ..$ : int 980
##   .. ..$ : int 952
##   .. ..$ : int 1080
##   .. ..$ : int 738
##   .. ..$ : int 2395
##   .. ..$ : int 540
##   .. ..$ : int 1231
##   .. ..$ : int 2031
##   .. ..$ : int 277
##   .. ..$ : int 155
##   .. ..$ : int 1858
##   .. ..$ : int 2096
##   .. ..$ : int 992
##   .. ..$ : int 2027
##   .. ..$ : int 76
##   .. ..$ : int 1557
##   .. ..$ : int 1818
##   .. ..$ : int 1009
##   .. ..$ : int 688
##   .. ..$ : int 191
##   .. ..$ : int 1887
##   .. ..$ : int 1513
##   .. ..$ : int 449
##   .. ..$ : int 568
##   .. ..$ : int 2415
##   .. ..$ : int 1239
##   .. ..$ : int 515
##   .. ..$ : int 2057
##   .. ..$ : int 1484
##   .. ..$ : int 2052
##   .. ..$ : int 107
##   .. ..$ : int 1765
##   .. ..$ : int 1961
##   .. ..$ : int 185
##   .. ..$ : int 564
##   .. ..$ : int 1769
##   .. ..$ : int 1168
##   .. ..$ : int 2136
##   .. ..$ : int 1438
##   .. ..$ : int 358
##   .. ..$ : int 460
##   .. ..$ : int 459
##   .. ..$ : int 1933
##   .. ..$ : int 1313
##   .. ..$ : int 2040
##   .. ..$ : int 775
##   .. ..$ : int 690
##   .. ..$ : int 123
##   .. ..$ : int 752
##   .. ..$ : int 1963
##   .. ..$ : int 1190
##   .. ..$ : int 578
##   .. ..$ : int 2162
##   .. ..$ : int 732
##   .. ..$ : int 1638
##   .. ..$ : int 2266
##   .. ..$ : int 895
##   .. .. [list output truncated]
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

Inputing lexicons, dictionaries and stop words

library(tidytext)
library(tm)

safety<- c("Health","Quality","Chemical","Lead","Hazardous","Test","Tests","Temperature","Chemicals","hazards","Testing","kit","Testing","kits","advisory","Notices","Notice","lead","mercury","nitrates","cyanobacteria","arsenic","nitrate","cyanobacterium","Quality","safety","safe","dangerous","recommendations","regulations","recommendation","regulation","fish","advisories","boil","protected","clean","recreation","chlorine","chloromine","estuarine","habitat","pollution","contamination","oil","spill","cleanup","botulism","clear","organisms","bacteria","E.","Coli","virus","parasite","toxic","algae","blooms","salmon","sewage","steelhead","tainted","turbid","turbidity","taste")

affordability<- c("Fund","Funds","Cost","Costs","Low","Income","Vulnerable","Bills","Tax","Bill","Taxes","rate","rates","relief","payments","moratorium","funding")


reliability<- c("Shortages","Shortage","aquifer","precipitation","renovation","culvert","renovations","repair","dam","source","lines","pipe","sewer","recharge","supply","tide","gate","tide","gates","repairs","infrastructure","dams","pipes","sewers","storage","retention","construction","irrigation","restoration","drainage","levee","dike","well","wells","damage","damages","damaged","overflow","combined","sewer","overflow","leak","leaks","leaky","plumbing","utility","irrigation","canals","king","tides","water","rights","recycled","desalination","reclamation","delivery","takings","permit","conserve","conservation","transfers","upstream","downstream","fire","wastewater","drinking","water","main","water","main","break","reservoir")


availability<- c("drought","droughts","flood","floods","scarcity","precipitation","rain","snow","snowmelt","rainwater","groundwater","outflow","ice","tsunami","receding","recede","snowpack","storms")

Oregon_stop_words <- bind_rows(tibble(word = c("water","Oregon", "city", "city's", "Bend", "Baker", "warm","springs", "oregon", "Portland", "portland", "it's", "Vale", "we're", "vale", "bend"),  #function to automatically add upper and lower cases?
                                      lexicon = c("custom")), 
                               stop_words)

Creating a tidy dataset of articles post 2016…

tidyarticles <- data %>%
  filter(when > "2015-12-31") %>%
  group_by(article, place, when) 

Add classification variables based on Oregon Water Resource Districts

tidyarticles$OWRD <- "UNASSIGNED"
tidyarticles$OWRD[grep("Portland", tidyarticles$place)] <- "North West Region" 
tidyarticles$OWRD[grep("Astoria", tidyarticles$place)] <- "North West Region" 
tidyarticles$OWRD[grep("Clatskanie", tidyarticles$place)] <- "North West Region" 
tidyarticles$OWRD[grep("Salem", tidyarticles$place)] <- "North West Region" 
tidyarticles$OWRD[grep("Tillamook", tidyarticles$place)] <- "North West Region" 
tidyarticles$OWRD[grep("Dallas", tidyarticles$place)] <- "North West Region"
tidyarticles$OWRD[grep("Newport", tidyarticles$place)] <- "North West Region" 
tidyarticles$OWRD[grep("Baker City", tidyarticles$place)] <- "East Region" 
tidyarticles$OWRD[grep("Burns", tidyarticles$place)] <- "East Region" 
tidyarticles$OWRD[grep("Ontario", tidyarticles$place)] <- "East Region" 
tidyarticles$OWRD[grep("Vale", tidyarticles$place)] <- "East Region" 
tidyarticles$OWRD[grep("La Grande", tidyarticles$place)] <- "East Region" 
tidyarticles$OWRD[grep("Hood River", tidyarticles$place)] <- "North Central Region" 
tidyarticles$OWRD[grep("Pendleton", tidyarticles$place)] <- "North Central Region" 
tidyarticles$OWRD[grep("The Dalles", tidyarticles$place)] <- "North Central Region" 
tidyarticles$OWRD[grep("Bend", tidyarticles$place)] <- "South Central Region" 
tidyarticles$OWRD[grep("Klamath Falls", tidyarticles$place)] <- "South Central Region" 
tidyarticles$OWRD[grep("Lakeview", tidyarticles$place)] <- "South Central Region" 
tidyarticles$OWRD[grep("Sisters", tidyarticles$place)] <- "South Central Region" 
tidyarticles$OWRD[grep("Warm Springs", tidyarticles$place)] <- "South Central Region" 
tidyarticles$OWRD[grep("Brookings", tidyarticles$place)] <- "South West Region" 
tidyarticles$OWRD[grep("Medford", tidyarticles$place)] <- "South West Region" 
tidyarticles$OWRD[grep("Roseburg", tidyarticles$place)] <- "South West Region" 
tidyarticles$OWRD[grep("Coos Bay", tidyarticles$place)] <- "South West Region" 

And now, tokenizing

tokenarticles <- tidyarticles %>%
  group_by(article, place, when, OWRD) %>% 
  unnest_tokens(word,edited)%>% 
  filter(when > "2015-12-31") %>% 
  anti_join(stop_words) %>% 
  anti_join(Oregon_stop_words)

I want to see a breakdown of articles by location/ OWRD

tidyarticles %>%
  group_by(place) %>% 
  distinct(article) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  print.data.frame()
##            place   n
## 1          Salem 251
## 2       Portland 245
## 3        Newport 184
## 4  Klamath Falls 176
## 5           Bend 128
## 6       Coos Bay 128
## 7      Pendleton 122
## 8      Tillamook 102
## 9       Roseburg 100
## 10       Ontario  97
## 11         Burns  68
## 12    Hood River  58
## 13  Warm Springs  56
## 14    Baker City  39
## 15    The Dalles  34
## 16    Clatskanie  31
## 17        Dallas  28
## 18       Sisters  24
## 19     Brookings  20
## 20     La Grande  20
## 21       Medford  20
## 22          Vale  20
## 23       Astoria  10
## 24      Lakeview   9
tidyarticles %>%
  group_by(OWRD) %>% 
  distinct(article) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  print.data.frame()
##                   OWRD   n
## 1    North West Region 851
## 2 South Central Region 393
## 3    South West Region 268
## 4          East Region 244
## 5 North Central Region 214

Finally resulting in 1970 articles for analysis

What is a Sentiment Analysis

The Bing et. al Lexicon is one of the more basic available lexicons. It categorizes words purely as ‘negative’ or ‘positive values’ - but, I’m changing it a bit

library(tidyr)

bing <- tokenarticles %>% 
  inner_join(get_sentiments("bing")) %>% # I'm telling R to join a column of bing sentiment values to my dataset
  count(place, OWRD, when, article, sentiment) %>% # creating a count column of my articles and their cumulative sentiments
  spread(sentiment, n, fill=0) %>%  # splitting that column based on the +/- sentiments...
  mutate(sentiment = positive - negative) %>% #...to analyze  them again
  rename(bingraw = sentiment) %>% # and now renaming
  dplyr::select(-positive, -negative) #and tidying

I now have a sentiment analysis of the text using the Bing lexicon: I have the cumulative positive and negative values for each article

head(bing)%>% 
  arrange(desc(when))
## # A tibble: 6 x 5
## # Groups:   article, place, when, OWRD [6]
##   article                                 place   when       OWRD        bingraw
##   <chr>                                   <chr>   <date>     <chr>         <dbl>
## 1 "‘Extreme draining’ of Oregon reservoi… Portla… 2019-05-21 North West…     -10
## 2 " ‘Polluted by Money’ series underscor… Portla… 2019-03-23 North West…       7
## 3 "‘Affordability concerns’: Costly arse… Ontario 2018-10-09 East Region      -5
## 4 "‘Carbon pollution isn’t free’: How Or… Portla… 2017-03-09 North West…      19
## 5 "‘Connecting Past to Future’: Tribal l… Hood R… 2017-01-13 North Cent…       0
## 6 "‘Hotter, earlier, longer’: Groups pla… Hood R… 2016-08-27 North Cent…       0

Doing the same with the Afinn lexicon; this lexicon is slightly different, as it provides words with a score from -5 to 5

  • Will need to process it differently
afinn <- tokenarticles %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(place, OWRD, when, article, value) %>% 
  summarise(afinnraw = sum(value)) %>% # Summarizing my article Afinn scores slightly differently here
  drop_na() %>% 
  summarise(afinnraw = sum(afinnraw))
head(afinn)
## # A tibble: 6 x 5
## # Groups:   place, OWRD, when [6]
##   place  OWRD        when       article                                 afinnraw
##   <chr>  <chr>       <date>     <chr>                                      <dbl>
## 1 Astor… North West… 2016-06-21 Astoria city dam likely to survive qua…       -9
## 2 Astor… North West… 2016-06-22 Stormwater projects top of the list in…       15
## 3 Astor… North West… 2017-12-21 Commercial Crabbing to Start in January        0
## 4 Astor… North West… 2018-01-09 Oregon transportation workers spray it…      -18
## 5 Astor… North West… 2018-01-12 Salmon are losing their genetic divers…       -3
## 6 Astor… North West… 2018-01-15 Knappa Water Association flushing wate…       -1

And then I created a single dataset with both aggregated lexicon scores

duolex <- data.frame(inner_join(afinn, bing))
duosent <- duolex %>%
  gather(key= "sentiment", value = "scores", -c(place, OWRD, when, article)) #alternatively, use pivot_longer to grab just 1 key-value pair
head(duosent)
##     place              OWRD       when
## 1 Astoria North West Region 2016-06-21
## 2 Astoria North West Region 2016-06-22
## 3 Astoria North West Region 2017-12-21
## 4 Astoria North West Region 2018-01-09
## 5 Astoria North West Region 2018-01-12
## 6 Astoria North West Region 2018-01-15
##                                                                            article
## 1                                         Astoria city dam likely to survive quake
## 2                    Stormwater projects top of the list in Port of Astoria budget
## 3                                          Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5                                        Salmon are losing their genetic diversity
## 6                                    Knappa Water Association flushing water mains
##   sentiment scores
## 1  afinnraw     -9
## 2  afinnraw     15
## 3  afinnraw      0
## 4  afinnraw    -18
## 5  afinnraw     -3
## 6  afinnraw     -1

I decided to wrangle my data some more.

library(scales)
duosent$abscores <- abs(duosent$scores) # created a new column of absolute sentiment values
duosent$perc <- rescale(duosent$scores, to=c(0,100)) # creating a new column of normalized scores - converting the entire range of sentiment scores to a 0-100 scale 
duosent$overall <- ifelse(duosent$scores >0, "positive", "negative") #and yet another column of another variable
head(duosent)
##     place              OWRD       when
## 1 Astoria North West Region 2016-06-21
## 2 Astoria North West Region 2016-06-22
## 3 Astoria North West Region 2017-12-21
## 4 Astoria North West Region 2018-01-09
## 5 Astoria North West Region 2018-01-12
## 6 Astoria North West Region 2018-01-15
##                                                                            article
## 1                                         Astoria city dam likely to survive quake
## 2                    Stormwater projects top of the list in Port of Astoria budget
## 3                                          Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5                                        Salmon are losing their genetic diversity
## 6                                    Knappa Water Association flushing water mains
##   sentiment scores abscores     perc  overall
## 1  afinnraw     -9        9 48.73646 negative
## 2  afinnraw     15       15 57.40072 positive
## 3  afinnraw      0        0 51.98556 negative
## 4  afinnraw    -18       18 45.48736 negative
## 5  afinnraw     -3        3 50.90253 negative
## 6  afinnraw     -1        1 51.62455 negative

And then got to plotting!

I wanted to plot some data to see if I could see anything…

library(ggplot2)
plotduosent <- ggplot(duosent, aes(x=when, y=scores, color=overall)) +
  geom_jitter()+
  geom_smooth(color="black", linetype ="dashed") + 
  xlab("") #I'm hiding this axis on purpose!

plotduosent+
  theme_minimal()

whereplot <- ggplot(duosent, aes(x=place, y=scores))
whenplot <- ggplot(duosent, aes(x=when,y=scores))


whereplot+
 # geom_bar(stat='identity', aes(levels(factor(fill=duosent$overall))), position="dodge")+ # I like to write out the code of the tasks I'm trying to accomplish even if the syntax isn't correct
  geom_bar(stat = "identity", position="dodge", aes(y=abscores, fill=overall))+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 60))

This figure shows all positive and negative sentiment scores in all the newspapers from Oregon.

It seems like sentiment scores are pretty evenly positive and negative across most locations - Roseburg and Hood River MAYBE are some of the obvious exceptions.

And now, I’m interested in sentiment patterns across all locations over time…

whenplot+
  geom_bar(stat = "identity", position="stack", aes(y=scores, fill=overall))+ #I'm constructing this even though it will barely be visible
  geom_smooth(method="loess")+ #chose the loess method for smoothing because of the presence of outliers
  guides(fill=FALSE)+ #removed the legend
  scale_x_date(date_labels = "%y")+
  theme_minimal()+
  theme_linedraw()+
  facet_wrap(~place, ncol = 4)

Resulting in:

What is a topic model

LDA Defn

“LDA is a mathematical method for …. finding the mixture of words that is associated with each topic, while also determining the mixture of topics that describes each document.” Silge 2017

Provides a beta: per-topic-per-word probability

We want to find topics for each of the cities

This involves first dividing up our dataset into dataframes of each location

token_Portland<-subset(tokenarticles, place=="Portland")
token_Salem<-subset(tokenarticles, place=="Salem")
token_Sisters<-subset(tokenarticles, place=="Sisters")
token_Newport<-subset(tokenarticles, place=="Newport")
token_Klamath_Falls<-subset(tokenarticles, place=="Klamath Falls")
token_Bend<-subset(tokenarticles, place=="Bend")
token_Coos_Bay<-subset(tokenarticles, place=="Coos Bay")
token_Pendleton<-subset(tokenarticles, place=="Pendleton")
token_Tillamook<-subset(tokenarticles, place=="Tillamook")
token_Roseburg<-subset(tokenarticles, place=="Roseburg")
token_Ontario<-subset(tokenarticles, place=="Ontario")
token_Burns<-subset(tokenarticles, place=="Burns")
token_Hood_River<-subset(tokenarticles, place=="Hood River")
token_Warm_Springs<-subset(tokenarticles, place=="Warm Springs")
token_Baker_City<-subset(tokenarticles, place=="Baker City")
token_The_Dalles<-subset(tokenarticles, place=="The Dalles")
token_Clatskanie<-subset(tokenarticles, place=="Clatskanie")
token_Dallas<-subset(tokenarticles, place=="Dallas")
token_Brookings<-subset(tokenarticles, place=="Brookings")
token_La_Grande<-subset(tokenarticles, place=="La Grande")
token_Medford<-subset(tokenarticles, place=="Medford")
token_Vale<-subset(tokenarticles, place=="Vale")
token_Astoria<-subset(tokenarticles, place=="Astoria")
token_Lakeview<-subset(tokenarticles, place=="Lakeview")

Looking first at Warm Springs

library(topicmodels)
library(tidyr)


Warm_Springs_dtm <- token_Warm_Springs %>% 
  anti_join(stop_words) %>%
  anti_join(Oregon_stop_words) %>%   
  count(article, word, sort = TRUE) %>%
  cast_dtm(article, word, n)
  
Warm_Springs_LDA <- LDA(Warm_Springs_dtm, k = 4)
Warm_Springs_LDA
## A LDA_VEM topic model with 4 topics.
Warm_Spring_topics <- tidy(Warm_Springs_LDA, matrix = "beta")
Warm_Spring_topics
## # A tibble: 11,976 x 3
##    topic term          beta
##    <int> <chr>        <dbl>
##  1     1 treatment 0.000724
##  2     2 treatment 0.0145  
##  3     3 treatment 0.0108  
##  4     4 treatment 0.00173 
##  5     1 community 0.00482 
##  6     2 community 0.00636 
##  7     3 community 0.00434 
##  8     4 community 0.0123  
##  9     1 tribal    0.00301 
## 10     2 tribal    0.0127  
## # … with 11,966 more rows

the 10 terms that are most common within each topic

Warm_Springs_top_terms <- Warm_Spring_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
Warm_Springs_top_terms 
## # A tibble: 40 x 3
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 salmon  0.0139 
##  2     1 tribes  0.0116 
##  3     1 fish    0.0111 
##  4     1 dams    0.0106 
##  5     1 judge   0.00890
##  6     1 river   0.00867
##  7     1 tower   0.00835
##  8     1 spill   0.00723
##  9     1 court   0.00723
## 10     1 federal 0.00698
## # … with 30 more rows
Warm_Springs_top_terms  %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

Astoria

Baker City

Bend

Brookings

Burns

Clatskanie

Coos_Bay

Dallas

Klamath_Falls

Hood_RIver

La_Grande

Lakeview

Medford

Newport

Ontario

Portland

Pendleton

Salem

Tillamook

Roseburg

Sisters

The_Dalles

Vale

First, dropping all repetitions of words within articles

saracounts <- tokenarticles %>% distinct(article, word, .keep_all=TRUE)

Conducting a row-wise word comparison and count

saracounts$safety<- as.numeric(saracounts$word %in% safety)
saracounts$reliability<- as.numeric(saracounts$word %in% reliability)
saracounts$availibility<- as.numeric(saracounts$word %in% availability)
saracounts$affordability<- as.numeric(saracounts$word %in% affordability)

And now, dropping rows where no SARA words were found

saracounttidy <- saracounts[rowSums(saracounts[, 6:9] == 0) < 4,]

Resulting in these three datasets

saratidyarticle <- aggregate(cbind(safety,reliability, availibility, affordability) ~ article, data = saracounttidy, FUN = sum, na.rm = TRUE)
saratidyplace <- aggregate(cbind(safety,reliability, availibility, affordability) ~ place, data = saracounttidy, FUN = sum, na.rm = TRUE)
saratidywrd <- aggregate(cbind(safety,reliability, availibility, affordability) ~ OWRD, data = saracounttidy, FUN = sum, na.rm = TRUE)

saratidywrd
##                   OWRD safety reliability availibility affordability
## 1          East Region    371         743          202            77
## 2 North Central Region    432         735          142            87
## 3    North West Region   1990        2461          540           267
## 4 South Central Region    936        1565          290           138
## 5    South West Region    582         737          150            65
saratidyplace
##            place safety reliability availibility affordability
## 1        Astoria     19          22            4             2
## 2     Baker City     41         120           51            17
## 3           Bend    323         666          119            41
## 4      Brookings     39          54           14             1
## 5          Burns    116         177           54            37
## 6     Clatskanie     73          91           18             8
## 7       Coos Bay    263         312           45            46
## 8         Dallas     43          93            6            11
## 9     Hood River    122         142           36            15
## 10 Klamath Falls    463         623          127            50
## 11     La Grande     30          46           15             5
## 12      Lakeview      9          31           16             3
## 13       Medford     41          57           16             5
## 14       Newport    328         426           76            42
## 15       Ontario    144         333           52            17
## 16     Pendleton    228         467           97            58
## 17      Portland    808         843          162            86
## 18      Roseburg    239         314           75            13
## 19         Salem    504         763          216            81
## 20       Sisters     35          81           20            16
## 21    The Dalles     82         126            9            14
## 22     Tillamook    215         223           58            37
## 23          Vale     40          67           30             1
## 24  Warm Springs    106         164            8            28
head(saratidyarticle,5)
##                                                                                                             article
## 1               ‘Polluted by Money’ series underscores our commitment to journalism in the public interest (column)
## 2                           ‘Affordability concerns’: Costly arsenic solution will trickle down to city’s residents
## 3                                                ‘Carbon pollution isn’t free’: How Oregon can cap, trade, reinvest
## 4 ‘Connecting Past to Future’: Tribal liaison Paul Lumley speaks about issues, crises facing tribes along the river
## 5                       ‘Extreme draining’ of Oregon reservoir eliminates invasive species, helps endangered salmon
##   safety reliability availibility affordability
## 1      5           0            0             0
## 2      2           4            0             0
## 3      5           1            2             0
## 4      2           3            0             0
## 5      3           6            0             0

And now, creating a bar chart for SARA word count and regions

First, create a long, not wide, dataset from our raw SARA word counts

library(ggplot2)
library(scales)

t_saraowrd <- saratidywrd %>% 
  gather(safety:affordability, key= SARA, value = Count)

t_saraowrd$OWRD <- as.factor(t_saraowrd$OWRD)
t_saraowrd$SARA <- as.factor(t_saraowrd$SARA)

str(t_saraowrd)
## 'data.frame':    20 obs. of  3 variables:
##  $ OWRD : Factor w/ 5 levels "East Region",..: 1 2 3 4 5 1 2 3 4 5 ...
##  $ SARA : Factor w/ 4 levels "affordability",..: 4 4 4 4 4 3 3 3 3 3 ...
##  $ Count: num  371 432 1990 936 582 ...

Calculating percentages by region

p_owrd <- group_by(t_saraowrd, OWRD) %>% 
  mutate(percent = ((Count/sum(Count)*100)))

p_owrd <- p_owrd[, -3] 

str(p_owrd)
## tibble [20 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ OWRD   : Factor w/ 5 levels "East Region",..: 1 2 3 4 5 1 2 3 4 5 ...
##  $ SARA   : Factor w/ 4 levels "affordability",..: 4 4 4 4 4 3 3 3 3 3 ...
##  $ percent: num [1:20] 26.6 30.9 37.8 32 37.9 ...
##  - attr(*, "groups")= tibble [5 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ OWRD : Factor w/ 5 levels "East Region",..: 1 2 3 4 5
##   ..$ .rows: list<int> [1:5] 
##   .. ..$ : int [1:4] 1 6 11 16
##   .. ..$ : int [1:4] 2 7 12 17
##   .. ..$ : int [1:4] 3 8 13 18
##   .. ..$ : int [1:4] 4 9 14 19
##   .. ..$ : int [1:4] 5 10 15 20
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

And then, plotting first a general SARA breakdown (%) across all regions

p1 <-  ggplot(p_owrd) +
  geom_col(aes(x=SARA, y=percent), stat= 'identity', position= 'dodge')+
  theme_minimal()
p1

And now, creating a bar chart for SARA word count and regions

p1 <-  ggplot(p_owrd) +
  geom_col(aes(x=SARA, y=percent, fill = SARA), stat= 'identity', position= 'dodge')+
  facet_wrap(p_owrd$OWRD)+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 35))
p1

Now trying the same code for a bar chart for SARA word count and regions

library(ggplot2)
library(scales)

t_saraplace <- saratidyplace %>% 
  gather(safety:affordability, key= SARA, value = Count)

t_saraplace$place <- as.factor(t_saraplace$place)
t_saraplace$SARA <- as.factor(t_saraplace$SARA)

str(t_saraplace)
## 'data.frame':    96 obs. of  3 variables:
##  $ place: Factor w/ 24 levels "Astoria","Baker City",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ SARA : Factor w/ 4 levels "affordability",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Count: num  19 41 323 39 116 73 263 43 122 463 ...
p_place <- group_by(t_saraplace, place) %>% 
  mutate(percent = ((Count/sum(Count)*100)))

p_place <- p_place[, -3] 

str(p_place)
## tibble [96 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ place  : Factor w/ 24 levels "Astoria","Baker City",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ SARA   : Factor w/ 4 levels "affordability",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ percent: num [1:96] 40.4 17.9 28.1 36.1 30.2 ...
##  - attr(*, "groups")= tibble [24 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ place: Factor w/ 24 levels "Astoria","Baker City",..: 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ .rows: list<int> [1:24] 
##   .. ..$ : int [1:4] 1 25 49 73
##   .. ..$ : int [1:4] 2 26 50 74
##   .. ..$ : int [1:4] 3 27 51 75
##   .. ..$ : int [1:4] 4 28 52 76
##   .. ..$ : int [1:4] 5 29 53 77
##   .. ..$ : int [1:4] 6 30 54 78
##   .. ..$ : int [1:4] 7 31 55 79
##   .. ..$ : int [1:4] 8 32 56 80
##   .. ..$ : int [1:4] 9 33 57 81
##   .. ..$ : int [1:4] 10 34 58 82
##   .. ..$ : int [1:4] 11 35 59 83
##   .. ..$ : int [1:4] 12 36 60 84
##   .. ..$ : int [1:4] 13 37 61 85
##   .. ..$ : int [1:4] 14 38 62 86
##   .. ..$ : int [1:4] 15 39 63 87
##   .. ..$ : int [1:4] 16 40 64 88
##   .. ..$ : int [1:4] 17 41 65 89
##   .. ..$ : int [1:4] 18 42 66 90
##   .. ..$ : int [1:4] 19 43 67 91
##   .. ..$ : int [1:4] 20 44 68 92
##   .. ..$ : int [1:4] 21 45 69 93
##   .. ..$ : int [1:4] 22 46 70 94
##   .. ..$ : int [1:4] 23 47 71 95
##   .. ..$ : int [1:4] 24 48 72 96
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
p1 <- ggplot(p_place) +
  geom_col(aes(x=place, y=percent, fill = SARA), stat= 'identity', position= 'fill')+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 35))
p1 

p1 <-  ggplot(p_place) +
  geom_col(aes(x=SARA, y=percent, fill = SARA), stat= 'identity', position= 'dodge')+
  facet_wrap(p_place$place)+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 35))
p1

IMPORTANT: Necessary to equalize the number of words in all dictionaries

Next steps: Functions

Easy function examples

GIS