rm(list=ls()) # I like to use this to clear my global environment
library(dplyr)
library(tidytext)
library(tidyr)
library(janitor)
library(lubridate)
library(textdata)
library(ggplot2)
library(tm)
library(stringr)
library(scales)
library(topicmodels)
setwd('/Users/aakashupraity/Desktop/')
import <- read.csv('/Users/aakashupraity/Desktop/owsdata.csv', header=TRUE, stringsAsFactors = FALSE)
str(import) #Our initial dataset is a long and wide spreadsheet of dates, text, and numbers, with ill-advised names
## 'data.frame': 2434 obs. of 17 variables:
## $ PUB..LOCATION : chr "Dallas" "Dallas" "Portland" "Portland" ...
## $ NEWSPAPER : chr "Polk County Itemizer-Observer" "Polk County Itemizer-Observer" "Willamette Weekly" "Willamette Weekly" ...
## $ TITLE : chr "Water quality focus of advisory council" "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer" "Something in the Water" "\"Witchcraft\" For Bureaucrats" ...
## $ AUTHOR..Last.Name..First.Initial..: chr NA NA "Budnick, A. N." "Brosy, A." ...
## $ DATE.PUB...MM.DD.YYYY. : chr "5/2/01" "4/7/04" "3/29/05" "8/15/06" ...
## $ QUARTER.PUBLISHED : int 2 2 1 3 4 4 3 3 4 1 ...
## $ DATE.ACCESSED..MM.DD.YYYY. : chr "1/29/19" "1/29/19" "1/30/18" "2/13/18" ...
## $ VOLUME : int NA NA NA NA NA NA NA NA NA NA ...
## $ ISSUE.NUMBER : int NA NA NA NA NA NA NA NA NA NA ...
## $ PAGE : chr "" "" "" "" ...
## $ LINK : chr "http://www.polkio.com/news/2001/may/02/water-quality-focus-of-advisory-council/" "http://www.polkio.com/news/2004/apr/07/just-what-is-the-luckiamute-watershed-council/" "http://www.wweek.com/portland/article-4261-something-in-the-water.html" "http://www.wweek.com/portland/article-5953-witchcraft-for-bureaucrats.html" ...
## $ CITATION : chr "Water quality focus of advisory council. (2001, May 2).Polk County Itemizer-Observer. Retrieved from http://www.polkio.com/news "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer. (2004, April 7).Polk County Itemizer-Observer. Re "Bundick, A. N. (2005, March 29). Something in the Water. Willamette Week. Retrieved from http://www.wweek.com/portland/article- "Brosy, A. (2006, August 15). \"Witchcraft\" For Bureaucrats. Willamette Weekly. Retrieved from http://www.wweek.com/portland/ar ...
## $ DATA.ENTRY. : chr "X" "X" "X" "X" ...
## $ FILE.NAME : chr "DA012PC122" "DA042PC123" "PO051WW043" "PO063WW028" ...
## $ ORIGINAL.FILE.NAME : chr "not scraped" "not scraped" "not scraped" "not scraped" ...
## $ CLEAN.TXT. : chr "X" "X" "" "" ...
## $ FULL.CLEANED.TEXT : chr "Water quality and availability has been in the headlines a lot lately, but as most rural residents know, water is always a top "The Luckiamute Watershed Council works to improve water quality for humans, fish and wildlife.The Luckiamute Watershed Council "Portlanders have voted down fluoridation three times, but now the state Legislature may force it down our throats-literally.A b "Dick Torpey squints at the hot summer sky and slowly walks across a parking lot with two thin, yard-long rods held lightly in f ...
colnames(import)[colnames(import)=="DATE.PUB...MM.DD.YYYY."] <-"when" #Date article was published
colnames(import)[colnames(import)=="NEWSPAPER"] <-"paper" #Name of publication
colnames(import)[colnames(import)=="TITLE"] <-"article" #Title of article
colnames(import)[colnames(import)=="CITATION"] <-"cit" #Citation
colnames(import)[colnames(import)=="PUB..LOCATION"] <-"place" #Location of publishing house
colnames(import)[colnames(import)=="FULL.CLEANED.TEXT"] <-"edited" #Edited article text
colnames(import)[colnames(import)=="FILE.NAME"] <-"code" #Article identifier
import <- janitor::remove_empty(import, which = "cols") #removes empty columns
import$when <- lubridate:: mdy(import$when) #categorize data as formatted date
data <- import %>%
dplyr::select(when, place, paper, article, cit, code, edited) %>% #creating a dataset with only the information I'm interested in
mutate(linenumber=row_number(edited)) %>% #creating an index to better keep track of variables
group_by(article) # for now
ta <- data %>%
group_by(article, place, when) %>%
unnest_tokens(word,edited) # %>% breaking up my edited articles by words
#anti_join(stop_words)%>% this command removes commonly used, low sentiment words; I won't use it now, but will have to in the future
tidyarticles <- ta%>%
filter(when > "2015-12-31")
head(tidyarticles)
## # A tibble: 6 x 4
## # Groups: article, place, when [1]
## article place when word
## <chr> <chr> <date> <chr>
## 1 State snowpack rises above normal levels Clatskanie 2016-01-08 cold
## 2 State snowpack rises above normal levels Clatskanie 2016-01-08 mountain
## 3 State snowpack rises above normal levels Clatskanie 2016-01-08 temperatures
## 4 State snowpack rises above normal levels Clatskanie 2016-01-08 and
## 5 State snowpack rises above normal levels Clatskanie 2016-01-08 copious
## 6 State snowpack rises above normal levels Clatskanie 2016-01-08 amounts
tidyarticles %>%
group_by(place) %>%
distinct(article) %>%
count() %>%
arrange(desc(n)) %>%
print.data.frame()
## place n
## 1 Salem 251
## 2 Portland 245
## 3 Newport 184
## 4 Klamath Falls 176
## 5 Bend 129
## 6 Coos Bay 128
## 7 Pendleton 122
## 8 Tillamook 102
## 9 Roseburg 100
## 10 Ontario 97
## 11 Burns 68
## 12 Hood River 58
## 13 Warm Springs 56
## 14 Baker City 39
## 15 The Dalles 34
## 16 Clatskanie 31
## 17 Dallas 28
## 18 Sisters 24
## 19 Brookings 20
## 20 La Grande 20
## 21 Medford 20
## 22 Vale 20
## 23 Astoria 10
## 24 Lakeview 9
bing <- tidyarticles %>%
inner_join(get_sentiments("bing")) %>% # I'm telling R to join a column of bing sentiment values to my dataset
count(place, when, article, sentiment) %>% # creating a count column of my articles and their cumulative sentiments
spread(sentiment, n, fill=0) %>% # splitting that column based on the +/- sentiments...
mutate(sentiment = positive - negative) %>% #...to analyze them again
rename(bingraw = sentiment) %>% # and now renaming
dplyr::select(-positive, -negative) #and tidying
head(bing)%>%
arrange(desc(when))
## # A tibble: 6 x 4
## # Groups: article, place, when [6]
## article place when bingraw
## <chr> <chr> <date> <dbl>
## 1 "\"Ocean Shorts\" Film Series opens at VAC" Newpo… 2019-08-08 5
## 2 "\"Watershed is saturated.' Irrigation officials sa… Ontar… 2019-05-30 -1
## 3 "\"Respect the Water\" Safety Committee meets in th… Roseb… 2019-05-23 3
## 4 " \xd4Polluted by Money\xd5 series underscores our … Portl… 2019-03-23 14
## 5 "__Gov. Brown wants to cancel water swap plans for … Salem 2017-10-30 -2
## 6 "(If) a pipeline runs through it" Coos … 2017-03-23 0
afinn <- tidyarticles %>%
inner_join(get_sentiments("afinn")) %>%
group_by(place, when, article, value) %>%
summarise(afinnraw = sum(value)) %>% # Summarizing my article Afinn scores slightly differently here
drop_na() %>%
summarise(afinnraw = sum(afinnraw))
head(afinn)
## # A tibble: 6 x 4
## # Groups: place, when [6]
## place when article afinnraw
## <chr> <date> <chr> <dbl>
## 1 Astoria 2016-06-21 Astoria city dam likely to survive quake 2
## 2 Astoria 2016-06-22 Stormwater projects top of the list in Port of As… 19
## 3 Astoria 2017-12-21 Commercial Crabbing to Start in January 0
## 4 Astoria 2018-01-09 Oregon transportation workers spray it safe on Cl… -17
## 5 Astoria 2018-01-12 Salmon are losing their genetic diversity 12
## 6 Astoria 2018-01-15 Knappa Water Association flushing water mains -1
duolex <- data.frame(inner_join(afinn, bing))
duosent <- duolex %>%
gather(key= "sentiment", value = "scores", -c(place, when, article)) #alternatively, use pivot_longer to grab just 1 key-value pair
head(duosent)
## place when
## 1 Astoria 2016-06-21
## 2 Astoria 2016-06-22
## 3 Astoria 2017-12-21
## 4 Astoria 2018-01-09
## 5 Astoria 2018-01-12
## 6 Astoria 2018-01-15
## article
## 1 Astoria city dam likely to survive quake
## 2 Stormwater projects top of the list in Port of Astoria budget
## 3 Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5 Salmon are losing their genetic diversity
## 6 Knappa Water Association flushing water mains
## sentiment scores
## 1 afinnraw 2
## 2 afinnraw 19
## 3 afinnraw 0
## 4 afinnraw -17
## 5 afinnraw 12
## 6 afinnraw -1
duosent$abscores <- abs(duosent$scores) # created a new column of absolute sentiment values
duosent$perc <- rescale(duosent$scores, to=c(0,100)) # creating a new column of normalized scores - converting the entire range of sentiment scores to a 0-100 scale
duosent$overall <- ifelse(duosent$scores >0, "positive", "negative") #and yet another column of another variable
head(duosent)
## place when
## 1 Astoria 2016-06-21
## 2 Astoria 2016-06-22
## 3 Astoria 2017-12-21
## 4 Astoria 2018-01-09
## 5 Astoria 2018-01-12
## 6 Astoria 2018-01-15
## article
## 1 Astoria city dam likely to survive quake
## 2 Stormwater projects top of the list in Port of Astoria budget
## 3 Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5 Salmon are losing their genetic diversity
## 6 Knappa Water Association flushing water mains
## sentiment scores abscores perc overall
## 1 afinnraw 2 2 37.91946 positive
## 2 afinnraw 19 19 43.62416 positive
## 3 afinnraw 0 0 37.24832 negative
## 4 afinnraw -17 17 31.54362 negative
## 5 afinnraw 12 12 41.27517 positive
## 6 afinnraw -1 1 36.91275 negative
plotduosent <- ggplot(duosent, aes(x=when, y=scores, color=overall)) +
geom_jitter()+
geom_smooth(color="black", linetype ="dashed") +
xlab("") #I'm hiding this axis on purpose!
plotduosent+
theme_minimal()
plotduosent <- ggplot(duosent, aes(x=when, y=scores, color=overall)) +
geom_jitter()+
geom_smooth(color="black", linetype ="dashed") +
geom_ribbon(aes(ymin=-150, ymax=150),
alpha=0.1, #transparency
linetype=1, #solid, dashed or other line types
colour="grey70", #border line color
size=1, #border line size
fill="green") #fill color
plotduosent+
theme_minimal()
whereplot <- ggplot(duosent, aes(x=place, y=scores))
whenplot <- ggplot(duosent, aes(x=when,y=scores))
whereplot+
# geom_bar(stat='identity', aes(levels(factor(fill=duosent$overall))), position="dodge")+ # I like to write out the code of the tasks I'm trying to accomplish even if the syntax isn't correct
geom_bar(stat = "identity", position="dodge", aes(y=abscores, fill=overall))+
theme_minimal()+
theme(axis.text.x = element_text(angle = 60))
This figure shows all positive and negative sentiment scores in all the newspapers from Oregon.
It seems like sentiment scores are pretty evenly positive and negative across most locations - Roseburg and Hood River (1 of them each!) are some of the obvious exceptions.
whenplot+
geom_bar(stat = "identity", position="stack", aes(y=scores, fill=overall))+ #I'm constructing this even though it will barely be visible
geom_smooth(method="loess")+ #chose the loess method for smoothing because of the presence of outliers
guides(fill=FALSE)+ #removed the legend
scale_x_date(date_labels = "%y")+
theme_minimal()+
theme_linedraw()+
facet_wrap(~place, ncol = 7)
Resulting in:
ta_td <- tidyarticles %>%
anti_join(stop_words, by = "word") %>%
count(article, word) %>%
cast_dtm(article, word, n)
ta_lda <- LDA(ta_td, k=6)
ta_lda
## A LDA_VEM topic model with 6 topics.
ta_topics <- tidy(ta_lda, matrix="beta")
ta_topics
## # A tibble: 276,888 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 1 0.000471
## 2 2 1 0.00121
## 3 3 1 0.00143
## 4 4 1 0.00136
## 5 5 1 0.000629
## 6 6 1 0.00112
## 7 1 100,000 0.000108
## 8 2 100,000 0.00000575
## 9 3 100,000 0.000142
## 10 4 100,000 0.000000146
## # … with 276,878 more rows
ta_top <- ta_topics %>%
group_by(topic) %>%
top_n(10,beta) %>%
ungroup() %>%
arrange(topic, -beta)
ta_top %>%
mutate(term=reorder(term, beta)) %>%
ggplot(aes(term, beta, fill=factor(topic)))+
geom_col(show.legend=FALSE)+
facet_wrap(~topic,scales="free")+
coord_flip()