# Start from a clean slate: removes every (non-hidden) object in the current environment.
rm(list = ls())
library(dplyr)
library(tidytext)
library(tidyr)
library(janitor)
library(lubridate)
library(textdata)
library(ggplot2)
library(tm)
library(stringr)
library(scales)
# Work from the Desktop folder for the rest of the session.
setwd("/Users/aakashupraity/Desktop/")
# Read the article spreadsheet; text columns stay character vectors, not factors.
import <- read.csv(
  file = "/Users/aakashupraity/Desktop/owsdata.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
)
# The initial dataset is a long and wide spreadsheet of dates, text, and numbers, with ill-advised names
str(import)
## 'data.frame': 2431 obs. of 34 variables:
## $ PUB..LOCATION : chr "Dallas" "Dallas" "Portland" "Portland" ...
## $ NEWSPAPER : chr "Polk County Itemizer-Observer" "Polk County Itemizer-Observer" "Willamette Weekly" "Willamette Weekly" ...
## $ TITLE : chr "Water quality focus of advisory council" "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer" "Something in the Water" "\"Witchcraft\" For Bureaucrats" ...
## $ AUTHOR..Last.Name..First.Initial.. : chr NA NA "Budnick, A. N." "Brosy, A." ...
## $ DATE.PUB...MM.DD.YYYY. : chr "05/02/2001" "04/07/2004" "03/29/2005" "08/15/2006" ...
## $ QUARTER.PUBLISHED : int 2 2 1 3 4 4 3 3 4 1 ...
## $ DATE.ACCESSED..MM.DD.YYYY. : chr "1/29/2019" "1/29/2019" "1/30/2018" "2/13/2018" ...
## $ VOLUME : int NA NA NA NA NA NA NA NA NA NA ...
## $ ISSUE.NUMBER : int NA NA NA NA NA NA NA NA NA NA ...
## $ PAGE : chr "" "" "" "" ...
## $ LINK : chr "http://www.polkio.com/news/2001/may/02/water-quality-focus-of-advisory-council/" "http://www.polkio.com/news/2004/apr/07/just-what-is-the-luckiamute-watershed-council/" "http://www.wweek.com/portland/article-4261-something-in-the-water.html" "http://www.wweek.com/portland/article-5953-witchcraft-for-bureaucrats.html" ...
## $ CITATION : chr "Water quality focus of advisory council. (2001, May 2).Polk County Itemizer-Observer. Retrieved from http://www"| __truncated__ "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer. (2004, April 7).Polk County Itemi"| __truncated__ "Bundick, A. N. (2005, March 29). Something in the Water. Willamette Week. Retrieved from http://www.wweek.com/p"| __truncated__ "Brosy, A. (2006, August 15). \"Witchcraft\" For Bureaucrats. Willamette Weekly. Retrieved from http://www.wweek"| __truncated__ ...
## $ DATA.ENTRY. : chr "X" "X" "X" "X" ...
## $ FILE.NAME : chr "DA012PC122" "DA042PC123" "PO051WW043" "PO063WW028" ...
## $ ORIGINAL.FILE.NAME : chr "not scraped" "not scraped" "not scraped" "not scraped" ...
## $ CLEAN.TXT. : chr "X" "X" "" "" ...
## $ FULL.CLEANED.TEXT : chr "Water quality and availability has been in the headlines a lot lately, but as most rural residents know, water "| __truncated__ "The Luckiamute Watershed Council works to improve water quality for humans, fish and wildlife.The Luckiamute Wa"| __truncated__ "Portlanders have voted down fluoridation three times, but now the state Legislature may force it down our throa"| __truncated__ "Dick Torpey squints at the hot summer sky and slowly walks across a parking lot with two thin, yard-long rods h"| __truncated__ ...
## $ numbers.to.keep.track.of.OG.organization: chr "503" "504" "1347" "1348" ...
## $ X : chr "" "" "" "" ...
## $ X.1 : chr "" "" "" "" ...
## $ X.2 : chr "" "" "" "" ...
## $ X.3 : chr "" "" "" "" ...
## $ X.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ X.5 : logi NA NA NA NA NA NA ...
## $ X.6 : int NA NA NA NA NA NA NA NA NA NA ...
## $ X.7 : logi NA NA NA NA NA NA ...
## $ X.8 : logi NA NA NA NA NA NA ...
## $ X.9 : chr "" "" "" "" ...
## $ X.10 : chr "" "" "" "" ...
## $ X.11 : chr "" "" "" "" ...
## $ X.12 : chr "" "" "" "" ...
## $ X.13 : chr "" "" "" "" ...
## $ X.14 : chr "" "" "" "" ...
## $ X.15 : chr "" "" "" "" ...
# Clean up the raw import:
#   1. give the columns used below short names (rename() stops with an error
#      if one of the expected columns is missing, instead of silently skipping it),
#   2. drop columns that are entirely empty,
#   3. parse the month/day/year publication date into a Date.
import <- import %>%
  dplyr::rename(
    when = DATE.PUB...MM.DD.YYYY., # Date article was published
    paper = NEWSPAPER, # Name of publication
    article = TITLE, # Title of article
    cit = CITATION, # Citation
    place = PUB..LOCATION, # Location of publishing house
    edited = FULL.CLEANED.TEXT, # Edited article text
    code = FILE.NAME # Article identifier
  ) %>%
  janitor::remove_empty(which = "cols") %>%
  mutate(when = mdy(when))

# Keep only the columns of interest and add a per-row index.
# row_number() with no argument numbers the rows 1..n in their current order;
# row_number(edited) would instead rank rows by the article text and give NA
# wherever the text is missing.
data <- import %>%
  dplyr::select(when, place, paper, article, cit, code, edited) %>%
  mutate(linenumber = row_number()) %>%
  group_by(article) # for now

# One row per word: split the edited article text into lower-cased word tokens,
# keeping the article / place / date grouping.
tidyarticles <- data %>%
  group_by(article, place, when) %>%
  unnest_tokens(word, edited) # %>%
# anti_join(stop_words) %>% would remove commonly used, low sentiment words; not used now, but will be needed in the future
## # A tibble: 6 x 4
## # Groups: article, place, when [1]
## article place when word
## <chr> <chr> <date> <chr>
## 1 Water quality focus of advisory council Dallas 2001-05-02 water
## 2 Water quality focus of advisory council Dallas 2001-05-02 quality
## 3 Water quality focus of advisory council Dallas 2001-05-02 and
## 4 Water quality focus of advisory council Dallas 2001-05-02 availability
## 5 Water quality focus of advisory council Dallas 2001-05-02 has
## 6 Water quality focus of advisory council Dallas 2001-05-02 been
# Net Bing sentiment per article: (# positive words) - (# negative words).
bing <- tidyarticles %>%
  # keep only words present in the Bing lexicon; the join key is spelled out
  # so the match does not depend on whichever column names happen to overlap
  inner_join(get_sentiments("bing"), by = "word") %>%
  # number of positive / negative words in each article
  count(place, when, article, sentiment) %>%
  # one column per sentiment; an article with no words of one kind gets 0
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  # stored as double so it lines up with the AFINN totals it is combined with later
  mutate(bingraw = as.numeric(positive - negative)) %>%
  dplyr::select(-positive, -negative)
head(bing)
## # A tibble: 6 x 4
## # Groups: article, place, when [6]
## article place when bingraw
## <chr> <chr> <date> <dbl>
## 1 " ‘Polluted by Money’ series underscores our com… "Portlan… 2019-03-23 14
## 2 " Portland restaurants scramble after Friday's b… "Portlan… 2014-05-23 -9
## 3 "‘Affordability concerns’: Costly arsenic soluti… "Ontario" 2018-10-09 -3
## 4 "‘Carbon pollution isn’t free’: How Oregon can c… "Portlan… 2017-03-09 36
## 5 "‘Connecting Past to Future’: Tribal liaison Pau… "Hood Ri… 2017-01-13 3
## 6 "‘Extreme draining’ of Oregon reservoir eliminat… "Portlan… 2019-05-21 -6
# Net AFINN sentiment per article: the sum of the lexicon values of its words.
afinn <- tidyarticles %>%
  # keep only words present in the AFINN lexicon, matched explicitly on `word`
  inner_join(get_sentiments("afinn"), by = "word") %>%
  # regroup so the result is keyed place / when / article
  group_by(place, when, article) %>%
  # a single sum per article; summarise() peels off the last grouping level,
  # leaving the result grouped by place and when
  summarise(afinnraw = sum(value)) %>%
  # discard articles whose place, date or title is missing
  drop_na()
head(afinn)
## # A tibble: 6 x 4
## # Groups: place, when [6]
## place when article afinnraw
## <chr> <date> <chr> <dbl>
## 1 Astoria 2016-06-21 Astoria city dam likely to survive quake 2
## 2 Astoria 2016-06-22 Stormwater projects top of the list in Port of As… 19
## 3 Astoria 2017-12-21 Commercial Crabbing to Start in January 0
## 4 Astoria 2018-01-09 Oregon transportation workers spray it safe on Cl… -17
## 5 Astoria 2018-01-12 Salmon are losing their genetic diversity 12
## 6 Astoria 2018-01-15 Knappa Water Association flushing water mains -1
# Side-by-side table: one row per article that was scored by both lexicons.
# The join keys are listed explicitly rather than inferred from shared column names.
duolex <- data.frame(
  inner_join(afinn, bing, by = c("place", "when", "article"))
)
# Long format: one row per article per lexicon ("afinnraw" / "bingraw").
# gather() stacks all afinnraw rows first, then all bingraw rows; pivot_longer()
# is the newer alternative but interleaves the two, changing the row order.
duosent <- duolex %>%
  gather(key = "sentiment", value = "scores", -c(place, when, article))
head(duosent)
## place when
## 1 Astoria 2016-06-21
## 2 Astoria 2016-06-22
## 3 Astoria 2017-12-21
## 4 Astoria 2018-01-09
## 5 Astoria 2018-01-12
## 6 Astoria 2018-01-15
## article
## 1 Astoria city dam likely to survive quake
## 2 Stormwater projects top of the list in Port of Astoria budget
## 3 Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5 Salmon are losing their genetic diversity
## 6 Knappa Water Association flushing water mains
## sentiment scores
## 1 afinnraw 2
## 2 afinnraw 19
## 3 afinnraw 0
## 4 afinnraw -17
## 5 afinnraw 12
## 6 afinnraw -1
# First look: every score over time, drawn as a single line.
plotduosent <- ggplot(data = duosent, mapping = aes(x = when, y = scores)) +
  geom_line() + # a line is not the best way to plot this kind of data!
  labs(x = "") # the x-axis title is blanked on purpose
plotduosent
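Not very helpful! A single line running through every score, from both lexicons and all locations at once, is too noisy to read.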
# Restrict to the date range of interest, then derive the plotting columns:
#   abscores - absolute sentiment value
#   perc     - scores normalized to 0-100 across the whole filtered range (both lexicons together)
#   overall  - sign of the score as a label
# NOTE(review): both date bounds are exclusive, so 2014-01-01 and 2019-12-31 themselves
# are dropped, and a score of exactly 0 is labelled "negative" -- confirm both are intended.
duosent <- duosent %>%
  dplyr::filter(when > as.Date("2014-01-01") & when < as.Date("2019-12-31")) %>%
  mutate(
    abscores = abs(scores),
    perc = rescale(scores, to = c(0, 100)),
    overall = ifelse(scores > 0, "positive", "negative")
  ) %>%
  # grouping is applied last so the rescaling above spans all rows, not one date at a time
  group_by(when)
head(duosent)
## # A tibble: 6 x 8
## # Groups: when [6]
## place when article sentiment scores abscores perc overall
## <chr> <date> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 Astor… 2016-06-21 Astoria city dam li… afinnraw 2 2 43.9 positi…
## 2 Astor… 2016-06-22 Stormwater projects… afinnraw 19 19 49.1 positi…
## 3 Astor… 2017-12-21 Commercial Crabbing… afinnraw 0 0 43.3 negati…
## 4 Astor… 2018-01-09 Oregon transportati… afinnraw -17 17 38.2 negati…
## 5 Astor… 2018-01-12 Salmon are losing t… afinnraw 12 12 47.0 positi…
## 6 Astor… 2018-01-15 Knappa Water Associ… afinnraw -1 1 43.0 negati…
# Shared plot bases: scores by publishing location, and scores over time.
whereplot <- ggplot(data = duosent, mapping = aes(x = place, y = scores))
whenplot <- ggplot(data = duosent, mapping = aes(x = when, y = scores))
# Dodged bars of absolute score per location, filled by the positive/negative label.
whereplot +
  # geom_bar(stat='identity', aes(levels(factor(fill=duosent$overall))), position="dodge")+ # earlier attempt, kept as a note even though the syntax isn't correct
  geom_col(mapping = aes(y = abscores, fill = overall), position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60))
This figure shows the absolute positive and negative sentiment scores, grouped by publication location, for all of the Oregon newspapers in the 2014–2019 date range.
It seems like sentiment scores are pretty evenly positive and negative across most locations - Roseburg and Hood River (1 of them each!) are some of the obvious exceptions.
# Scores over time, one small panel per publishing location.
whenplot +
  # stacked bars of raw scores; constructed even though they will barely be visible
  geom_col(mapping = aes(y = scores, fill = overall), position = "stack") +
  # loess smoothing was chosen because of the presence of outliers
  geom_smooth(method = "loess") +
  # hide the fill legend ("none" replaces the deprecated `fill = FALSE`)
  guides(fill = "none") +
  # two-digit years on the x-axis
  scale_x_date(date_labels = "%y") +
  # theme_linedraw() is a complete theme, so no other complete theme is stacked before it
  theme_linedraw() +
  facet_wrap(~place, ncol = 7)
Resulting in: