rm(list=ls()) # I like to use this to clear my global environment
library(dplyr)
library(tidytext)
library(tidyr)
library(janitor)
library(lubridate)
library(textdata)
library(ggplot2)
library(tm)
library(stringr)
library(scales)
library(topicmodels)
library(quanteda)
setwd('/Users/aakashupraity/Desktop/')
import <- read.csv('/Users/aakashupraity/Desktop/owsfinal.csv', header=TRUE, stringsAsFactors = FALSE)
str(import) #Our initial dataset is a long and wide spreadsheet of dates, text, and numbers, with ill-advised names
## 'data.frame': 2431 obs. of 34 variables:
## $ PUB..LOCATION : chr "Dallas" "Dallas" "Portland" "Portland" ...
## $ NEWSPAPER : chr "Polk County Itemizer-Observer" "Polk County Itemizer-Observer" "Willamette Weekly" "Willamette Weekly" ...
## $ TITLE : chr "Water quality focus of advisory council" "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer" "Something in the Water" "\"Witchcraft\" For Bureaucrats" ...
## $ AUTHOR..Last.Name..First.Initial.. : chr NA NA "Budnick, A. N." "Brosy, A." ...
## $ DATE.PUB...MM.DD.YYYY. : chr "05/02/2001" "04/07/2004" "03/29/2005" "08/15/2006" ...
## $ QUARTER.PUBLISHED : int 2 2 1 3 4 4 3 3 4 1 ...
## $ DATE.ACCESSED..MM.DD.YYYY. : chr "1/29/2019" "1/29/2019" "1/30/2018" "2/13/2018" ...
## $ VOLUME : int NA NA NA NA NA NA NA NA NA NA ...
## $ ISSUE.NUMBER : int NA NA NA NA NA NA NA NA NA NA ...
## $ PAGE : chr "" "" "" "" ...
## $ LINK : chr "http://www.polkio.com/news/2001/may/02/water-quality-focus-of-advisory-council/" "http://www.polkio.com/news/2004/apr/07/just-what-is-the-luckiamute-watershed-council/" "http://www.wweek.com/portland/article-4261-something-in-the-water.html" "http://www.wweek.com/portland/article-5953-witchcraft-for-bureaucrats.html" ...
## $ CITATION : chr "Water quality focus of advisory council. (2001, May 2).Polk County Itemizer-Observer. Retrieved from http://www"| __truncated__ "Just what is the Luckiamute Watershed Council? Polk County Itemizer-Observer. (2004, April 7).Polk County Itemi"| __truncated__ "Bundick, A. N. (2005, March 29). Something in the Water. Willamette Week. Retrieved from http://www.wweek.com/p"| __truncated__ "Brosy, A. (2006, August 15). \"Witchcraft\" For Bureaucrats. Willamette Weekly. Retrieved from http://www.wweek"| __truncated__ ...
## $ DATA.ENTRY. : chr "X" "X" "X" "X" ...
## $ FILE.NAME : chr "DA012PC122" "DA042PC123" "PO051WW043" "PO063WW028" ...
## $ ORIGINAL.FILE.NAME : chr "not scraped" "not scraped" "not scraped" "not scraped" ...
## $ CLEAN.TXT. : chr "X" "X" "" "" ...
## $ FULL.CLEANED.TEXT : chr "Water quality and availability has been in the headlines a lot lately, but as most rural residents know, water "| __truncated__ "The Luckiamute Watershed Council works to improve water quality for humans, fish and wildlife.The Luckiamute Wa"| __truncated__ "Portlanders have voted down fluoridation three times, but now the state Legislature may force it down our throa"| __truncated__ "Dick Torpey squints at the hot summer sky and slowly walks across a parking lot with two thin, yard-long rods h"| __truncated__ ...
## $ numbers.to.keep.track.of.OG.organization: int 493 494 1331 1332 2223 2357 52 2224 351 2225 ...
## $ X : chr "" "" "" "" ...
## $ X.1 : chr "" "" "" "" ...
## $ X.2 : chr "" "" "" "" ...
## $ X.3 : chr "" "" "" "" ...
## $ X.4 : chr "" "" "" "" ...
## $ X.5 : int NA NA NA NA NA NA NA NA NA NA ...
## $ X.6 : int NA NA NA NA NA NA NA NA NA NA ...
## $ X.7 : logi NA NA NA NA NA NA ...
## $ X.8 : logi NA NA NA NA NA NA ...
## $ X.9 : chr "" "" "" "" ...
## $ X.10 : chr "" "" "" "" ...
## $ X.11 : chr "" "" "" "" ...
## $ X.12 : chr "" "" "" "" ...
## $ X.13 : chr "" "" "" "" ...
## $ X.14 : chr "" "" "" "" ...
## $ X.15 : chr "" "" "" "" ...
colnames(import)[colnames(import)=="DATE.PUB...MM.DD.YYYY."] <-"when" #Date article was published
colnames(import)[colnames(import)=="NEWSPAPER"] <-"paper" #Name of publication
colnames(import)[colnames(import)=="TITLE"] <-"article" #Title of article
colnames(import)[colnames(import)=="CITATION"] <-"cit" #Citation
colnames(import)[colnames(import)=="PUB..LOCATION"] <-"place" #Location of publishing house
colnames(import)[colnames(import)=="FULL.CLEANED.TEXT"] <-"edited" #Edited article text
colnames(import)[colnames(import)=="FILE.NAME"] <-"code" #Article identifier
import <- janitor::remove_empty(import, which = "cols") #removes empty columns
import$when <- lubridate:: mdy(import$when) #categorize data as formatted date
data <- import %>%
dplyr::select(when, place, paper, article, cit, code, edited) %>% #creating a dataset with only the information I'm interested in
mutate(linenumber=row_number(edited)) %>% #creating an index to better keep track of variables
group_by(article) # for now
Oregon_stop_words <- bind_rows(tibble(word = c("water","Oregon", "city", "city's", "Bend", "Baker", "warm","springs", "oregon", "Portland", "portland", "it's"), #function to automatically add upper and lower cases?
lexicon = c("custom")),
stop_words)
tidyarticles <- data %>%
filter(when > "2015-12-31") %>%
group_by(article, place, when)
tokenarticles <- data %>%
group_by(article, place, when) %>%
unnest_tokens(word,edited)%>%
filter(when > "2015-12-31") %>%
anti_join(stop_words) %>%
anti_join(Oregon_stop_words)
tidyarticles %>%
group_by(place) %>%
distinct(article) %>%
count() %>%
arrange(desc(n)) %>%
print.data.frame()
## place n
## 1 Salem 251
## 2 Portland 245
## 3 Newport 184
## 4 Klamath Falls 176
## 5 Bend 128
## 6 Coos Bay 128
## 7 Pendleton 122
## 8 Tillamook 102
## 9 Roseburg 100
## 10 Ontario 97
## 11 Burns 68
## 12 Hood River 58
## 13 Warm Springs 56
## 14 Baker City 39
## 15 The Dalles 34
## 16 Clatskanie 31
## 17 Dallas 28
## 18 Sisters 24
## 19 Brookings 20
## 20 La Grande 20
## 21 Medford 20
## 22 Vale 20
## 23 Astoria 10
## 24 Lakeview 9
bing <- tokenarticles %>%
inner_join(get_sentiments("bing")) %>% # I'm telling R to join a column of bing sentiment values to my dataset
count(place, when, article, sentiment) %>% # creating a count column of my articles and their cumulative sentiments
spread(sentiment, n, fill=0) %>% # splitting that column based on the +/- sentiments...
mutate(sentiment = positive - negative) %>% #...to analyze them again
rename(bingraw = sentiment) %>% # and now renaming
dplyr::select(-positive, -negative) #and tidying
head(bing)%>%
arrange(desc(when))
## # A tibble: 6 x 4
## # Groups: article, place, when [6]
## article place when bingraw
## <chr> <chr> <date> <dbl>
## 1 "‘Extreme draining’ of Oregon reservoir eliminate… Portland 2019-05-21 -10
## 2 " ‘Polluted by Money’ series underscores our comm… Portland 2019-03-23 7
## 3 "‘Affordability concerns’: Costly arsenic solutio… Ontario 2018-10-09 -5
## 4 "‘Carbon pollution isn’t free’: How Oregon can ca… Portland 2017-03-09 19
## 5 "‘Connecting Past to Future’: Tribal liaison Paul… Hood Ri… 2017-01-13 0
## 6 "‘Hotter, earlier, longer’: Groups plan to sue EP… Hood Ri… 2016-08-27 0
afinn <- tokenarticles %>%
inner_join(get_sentiments("afinn")) %>%
group_by(place, when, article, value) %>%
summarise(afinnraw = sum(value)) %>% # Summarizing my article Afinn scores slightly differently here
drop_na() %>%
summarise(afinnraw = sum(afinnraw))
head(afinn)
## # A tibble: 6 x 4
## # Groups: place, when [6]
## place when article afinnraw
## <chr> <date> <chr> <dbl>
## 1 Astoria 2016-06-21 Astoria city dam likely to survive quake -9
## 2 Astoria 2016-06-22 Stormwater projects top of the list in Port of As… 15
## 3 Astoria 2017-12-21 Commercial Crabbing to Start in January 0
## 4 Astoria 2018-01-09 Oregon transportation workers spray it safe on Cl… -18
## 5 Astoria 2018-01-12 Salmon are losing their genetic diversity -3
## 6 Astoria 2018-01-15 Knappa Water Association flushing water mains -1
duolex <- data.frame(inner_join(afinn, bing))
duosent <- duolex %>%
gather(key= "sentiment", value = "scores", -c(place, when, article)) #alternatively, use pivot_longer to grab just 1 key-value pair
head(duosent)
## place when
## 1 Astoria 2016-06-21
## 2 Astoria 2016-06-22
## 3 Astoria 2017-12-21
## 4 Astoria 2018-01-09
## 5 Astoria 2018-01-12
## 6 Astoria 2018-01-15
## article
## 1 Astoria city dam likely to survive quake
## 2 Stormwater projects top of the list in Port of Astoria budget
## 3 Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5 Salmon are losing their genetic diversity
## 6 Knappa Water Association flushing water mains
## sentiment scores
## 1 afinnraw -9
## 2 afinnraw 15
## 3 afinnraw 0
## 4 afinnraw -18
## 5 afinnraw -3
## 6 afinnraw -1
duosent$abscores <- abs(duosent$scores) # created a new column of absolute sentiment values
duosent$perc <- rescale(duosent$scores, to=c(0,100)) # creating a new column of normalized scores - converting the entire range of sentiment scores to a 0-100 scale
duosent$overall <- ifelse(duosent$scores >0, "positive", "negative") #and yet another column of another variable
head(duosent)
## place when
## 1 Astoria 2016-06-21
## 2 Astoria 2016-06-22
## 3 Astoria 2017-12-21
## 4 Astoria 2018-01-09
## 5 Astoria 2018-01-12
## 6 Astoria 2018-01-15
## article
## 1 Astoria city dam likely to survive quake
## 2 Stormwater projects top of the list in Port of Astoria budget
## 3 Commercial Crabbing to Start in January
## 4 Oregon transportation workers spray it safe on Clatsop County highways in winter
## 5 Salmon are losing their genetic diversity
## 6 Knappa Water Association flushing water mains
## sentiment scores abscores perc overall
## 1 afinnraw -9 9 48.73646 negative
## 2 afinnraw 15 15 57.40072 positive
## 3 afinnraw 0 0 51.98556 negative
## 4 afinnraw -18 18 45.48736 negative
## 5 afinnraw -3 3 50.90253 negative
## 6 afinnraw -1 1 51.62455 negative
plotduosent <- ggplot(duosent, aes(x=when, y=scores, color=overall)) +
geom_jitter()+
geom_smooth(color="black", linetype ="dashed") +
xlab("") #I'm hiding this axis on purpose!
plotduosent+
theme_minimal()
whereplot <- ggplot(duosent, aes(x=place, y=scores))
whenplot <- ggplot(duosent, aes(x=when,y=scores))
whereplot+
# geom_bar(stat='identity', aes(levels(factor(fill=duosent$overall))), position="dodge")+ # I like to write out the code of the tasks I'm trying to accomplish even if the syntax isn't correct
geom_bar(stat = "identity", position="dodge", aes(y=abscores, fill=overall))+
theme_minimal()+
theme(axis.text.x = element_text(angle = 60))
This figure shows all positive and negative sentiment scores in all the newspapers from Oregon.
It seems like sentiment scores are pretty evenly positive and negative across most locations - Roseburg and Hood River (1 of them each!) are some of the obvious exceptions.
whenplot+
geom_bar(stat = "identity", position="stack", aes(y=scores, fill=overall))+ #I'm constructing this even though it will barely be visible
geom_smooth(method="loess")+ #chose the loess method for smoothing because of the presence of outliers
guides(fill=FALSE)+ #removed the legend
scale_x_date(date_labels = "%y")+
theme_minimal()+
theme_linedraw()+
facet_wrap(~place, ncol = 4)
Resulting in:
“LDA is a mathematical method for …. finding the mixture of words that is associated with each topic, while also determining the mixture of topics that describes each document.” Silge 2017
Provides a beta: per-topic-per-word probability
for(i in unique(tokenarticles$place)) {
nam <- paste("tidy", i, sep = "_")
assign(nam, tokenarticles[tokenarticles$place==i,])
}
tidy_dtm <- function(x){
x %>%
anti_join(stop_words) %>%
anti_join(Oregon_stop_words) %>%
count(article, word, sort = TRUE) %>%
cast_dtm(article, word, n)
}
`tidy_Warm Springs` %>%
group_by() %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 9537
Warm_Springs_dtm <- `tidy_Warm Springs` %>%
anti_join(stop_words) %>%
anti_join(Oregon_stop_words) %>%
count(article, word, sort = TRUE) %>%
cast_dtm(article, word, n)
Warm_Springs_LDA <- LDA(Warm_Springs_dtm, k = 4)
Warm_Springs_LDA
## A LDA_VEM topic model with 4 topics.
Warm_Spring_topics <- tidy(Warm_Springs_LDA, matrix = "beta")
Warm_Spring_topics
## # A tibble: 11,980 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 treatment 0.00522
## 2 2 treatment 0.00203
## 3 3 treatment 0.00216
## 4 4 treatment 0.0163
## 5 1 community 0.00106
## 6 2 community 0.00969
## 7 3 community 0.00171
## 8 4 community 0.0122
## 9 1 tribal 0.00660
## 10 2 tribal 0.0176
## # … with 11,970 more rows
Warm_Springs_top_terms <- Warm_Spring_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
Warm_Springs_top_terms
## # A tibble: 40 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 system 0.0133
## 2 1 turbidity 0.0128
## 3 1 drinking 0.0122
## 4 1 health 0.00850
## 5 1 salmon 0.00691
## 6 1 spill 0.00691
## 7 1 tribal 0.00660
## 8 1 tribe 0.00609
## 9 1 notice 0.00585
## 10 1 council 0.00563
## # … with 30 more rows
Warm_Springs_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
## # A tibble: 1 x 1
## n
## <int>
## 1 2225
## # A tibble: 1 x 1
## n
## <int>
## 1 9480
## # A tibble: 1 x 1
## n
## <int>
## 1 42358
## # A tibble: 1 x 1
## n
## <int>
## 1 4219
## # A tibble: 1 x 1
## n
## <int>
## 1 24171
## # A tibble: 1 x 1
## n
## <int>
## 1 8044
## # A tibble: 1 x 1
## n
## <int>
## 1 29778
## # A tibble: 1 x 1
## n
## <int>
## 1 5517
## # A tibble: 1 x 1
## n
## <int>
## 1 14692
## # A tibble: 1 x 1
## n
## <int>
## 1 52863
## # A tibble: 1 x 1
## n
## <int>
## 1 4444
## # A tibble: 1 x 1
## n
## <int>
## 1 1475
## # A tibble: 1 x 1
## n
## <int>
## 1 4577
## # A tibble: 1 x 1
## n
## <int>
## 1 46497
## # A tibble: 1 x 1
## n
## <int>
## 1 19247
## # A tibble: 1 x 1
## n
## <int>
## 1 35201
## # A tibble: 1 x 1
## n
## <int>
## 1 102915
## # A tibble: 1 x 1
## n
## <int>
## 1 26371
## # A tibble: 1 x 1
## n
## <int>
## 1 67946
## # A tibble: 1 x 1
## n
## <int>
## 1 7210
## # A tibble: 1 x 1
## n
## <int>
## 1 28068
## # A tibble: 1 x 1
## n
## <int>
## 1 5871
## # A tibble: 1 x 1
## n
## <int>
## 1 9558
## <<DocumentTermMatrix (documents: 128, terms: 9706)>>
## Non-/sparse entries: 28808/1213560
## Sparsity : 98%
## Maximal term length: 35
## Weighting : term frequency (tf)