For our lab we decided to do a sentiment analysis of newspaper articles retrieved with the keyword “Data Science”. We further filtered the data by geographic regions of our choosing in the United States, and included the top 100 results of all time, because the history of coverage of data science varies by region (West Coast vs. Midwest).
The three regions we chose on the LexisNexis site were:
West Coast: Alaska or Arizona or California or Colorado or Hawaii or Idaho or Montana or Nevada or New Mexico or Oregon or Utah or Washington or Wyoming
Midwest: Illinois or Indiana or Iowa or Kansas or Michigan or Minnesota or Missouri or Nebraska or North Dakota or Ohio or Wisconsin
East Coast: Maine or New Hampshire or Massachusetts or Rhode Island or Connecticut or New York or New Jersey or Delaware or Maryland or Virginia or North Carolina or South Carolina or Georgia or Florida
We then chose newspapers for each region and filtered to the top 100 results in the database:
West Coast: Los Angeles Times, Eurasia Review, Spokesman Review
Midwest: Chicago Daily Herald, St. Louis Post-Dispatch (Missouri)
East Coast: New York Times, Atlanta Journal-Constitution, AP
# Read the files into the code
# (packages used throughout this section; load them here if they are not already loaded in a setup chunk)
library(tidyverse)   # readr, dplyr, tidyr, ggplot2
library(tidytext)    # unnest_tokens(), stop_words, get_sentiments(), bind_tf_idf()
library(ggwordcloud) # geom_text_wordcloud()
## West Coast
LA_Times<- readLines("LA_Times.txt")
Eurasia<- readLines("Eurasia_Review.txt")
Spokesman<- readLines("Spokesman_Review.txt")
West_Coast_Combined<- readLines("West_Coast.txt")
## Midwest
chicago_daily_herald_data <- read_lines("Chicago_Daily_Herald_(100).txt")
st_louis_dispatch_data <- read_lines("St._Louis_Post-Dispatch_(Missouri)_(100).txt")
midwest_data <- c(chicago_daily_herald_data, st_louis_dispatch_data) # Combined text for the entire region (c() concatenates the lines; paste() would only glue them together element-wise)
## East Coast
nyt<-read_lines("nyt.txt")
ajc<-read_lines("ajc.txt")
ap<-read_lines("ap.txt")
# Convert the files into tibbles that can be analyzed
## West Coast
LA_Times<- tibble(LA_Times)
Eurasia<- tibble(Eurasia)
Spokesman <- tibble(Spokesman)
West_Coast_Combined <- tibble(West_Coast_Combined)
## Midwest
chicago_daily_herald_data <- tibble(chicago_daily_herald_data)
st_louis_dispatch_data <- tibble(st_louis_dispatch_data)
midwest_data <- tibble(midwest_data)
## East Coast
nyt<-tibble(nyt)
ajc<-tibble(ajc)
ap<-tibble(ap)
c_nyt<-nyt%>%rename(value=nyt)
c_ajc<-ajc%>%rename(value=ajc)
c_ap<-ap%>%rename(value=ap)
east_coast_combined<-rbind(c_nyt["value"], c_ajc["value"], c_ap["value"])
# Coerce the arguments to character types that are stripped of attributes
## West Coast
LA_Times$LA_Times <- as.character(LA_Times$LA_Times)
Eurasia$Eurasia <- as.character(Eurasia$Eurasia)
Spokesman$Spokesman <- as.character(Spokesman$Spokesman)
West_Coast_Combined$West_Coast_Combined <- as.character(West_Coast_Combined$West_Coast_Combined)
## Midwest
chicago_daily_herald_data$chicago_daily_herald_data <- as.character(chicago_daily_herald_data$chicago_daily_herald_data)
st_louis_dispatch_data$st_louis_dispatch_data <- as.character(st_louis_dispatch_data$st_louis_dispatch_data)
midwest_data$midwest_data <- as.character(midwest_data$midwest_data)
## East Coast
nyt$nyt<-as.character(nyt$nyt)
ajc$ajc<-as.character(ajc$ajc)
ap$ap<-as.character(ap$ap)
east_coast_combined$value<-as.character(east_coast_combined$value)
east_coast_combined<-east_coast_combined%>%rename(east_coast_combined=value)
# Start the tokenization process, and use anti_join() with stop_words to remove common stop words
## West Coast
LA_Times <- LA_Times %>%
unnest_tokens(word, LA_Times)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
Eurasia <- Eurasia %>%
unnest_tokens(word, Eurasia)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
Spokesman <- Spokesman %>%
unnest_tokens(word, Spokesman)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
West_Coast_Combined <- West_Coast_Combined %>%
unnest_tokens(word, West_Coast_Combined)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
## Midwest
chicago_daily_herald_data <- chicago_daily_herald_data %>%
unnest_tokens(word, chicago_daily_herald_data)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
st_louis_dispatch_data <- st_louis_dispatch_data %>%
unnest_tokens(word, st_louis_dispatch_data)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
midwest_data <- midwest_data %>%
unnest_tokens(word, midwest_data)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
## East Coast
nyt_tokens<-nyt%>%
unnest_tokens(word, nyt)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
ajc_tokens<-ajc%>%
unnest_tokens(word, ajc)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
ap_tokens<-ap%>%
unnest_tokens(word, ap)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
east_coast_combined_tokens<-east_coast_combined%>%
unnest_tokens(word, east_coast_combined)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
Ok, now that we have our word frequencies, let’s do some analysis. We will compare the regions using sentiment analysis to see whether they generally align.
# For AFINN we see a list of words with numeric scores; at 2,467 words it is not really that many overall.
# NRC is a good amount larger at 13,891 entries, but as we can see, words are classified into several different categories.
# Bing is also a good amount larger at 6,776 words, but as we can see, it uses just negative and positive.
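# To verify these lexicon sizes yourself (exact counts can vary slightly with the
# tidytext/textdata version, and some lexicons may prompt for a one-time download):
nrow(get_sentiments("afinn"))
nrow(get_sentiments("bing"))
nrow(get_sentiments("nrc"))                          # word-category pairs
get_sentiments("nrc") %>% distinct(word) %>% nrow()  # unique words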
## West Coast
LA_sentiment_affin <- LA_Times %>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
LA_sentiment_nrc <- LA_Times %>%
inner_join(get_sentiments("nrc"))
LA_sentiment_bing <- LA_Times %>%
inner_join(get_sentiments("bing"))
Spokesman_sentiment_affin <- Spokesman %>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
Spokesman_sentiment_nrc <- Spokesman %>%
inner_join(get_sentiments("nrc"))
Spokesman_sentiment_bing <- Spokesman %>%
inner_join(get_sentiments("bing"))
Eurasia_sentiment_affin <- Eurasia %>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
Eurasia_sentiment_nrc <- Eurasia %>%
inner_join(get_sentiments("nrc"))
Eurasia_sentiment_bing <- Eurasia %>%
inner_join(get_sentiments("bing"))
West_Coast_Combined_sentiment_affin <- West_Coast_Combined %>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
West_Coast_Combined_sentiment_nrc <- West_Coast_Combined %>%
inner_join(get_sentiments("nrc"))
West_Coast_Combined_sentiment_bing <- West_Coast_Combined %>%
inner_join(get_sentiments("bing"))
## Midwest
chicago_daily_herald_sentiment_afinn <- chicago_daily_herald_data %>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
chicago_daily_herald_sentiment_nrc <- chicago_daily_herald_data %>%
inner_join(get_sentiments("nrc"))
chicago_daily_herald_sentiment_bing <- chicago_daily_herald_data %>%
inner_join(get_sentiments("bing"))
st_louis_dispatch_sentiment_afinn <- st_louis_dispatch_data %>%
inner_join(get_sentiments("afinn"))
st_louis_dispatch_sentiment_nrc <- st_louis_dispatch_data %>%
inner_join(get_sentiments("nrc"))
st_louis_dispatch_sentiment_bing <- st_louis_dispatch_data %>%
inner_join(get_sentiments("bing"))
midwest_sentiment_afinn <- midwest_data %>%
inner_join(get_sentiments("afinn"))
midwest_sentiment_nrc <- midwest_data %>%
inner_join(get_sentiments("nrc"))
midwest_sentiment_bing <- midwest_data %>%
inner_join(get_sentiments("bing"))
## East Coast
nyt_afinn<-nyt_tokens%>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
nyt_nrc<-nyt_tokens%>%
inner_join(get_sentiments("nrc"))
nyt_bing<-nyt_tokens%>%
inner_join(get_sentiments("bing"))
ajc_afinn<-ajc_tokens%>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
ajc_nrc<-ajc_tokens%>%
inner_join(get_sentiments("nrc"))
ajc_bing<-ajc_tokens%>%
inner_join(get_sentiments("bing"))
ap_afinn<-ap_tokens%>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
ap_nrc<-ap_tokens%>%
inner_join(get_sentiments("nrc"))
ap_bing<-ap_tokens%>%
inner_join(get_sentiments("bing"))
east_coast_combined_afinn<-east_coast_combined_tokens%>%
inner_join(get_sentiments("afinn")) # using an inner join to match words and add the sentiment variable
east_coast_combined_nrc<-east_coast_combined_tokens%>%
inner_join(get_sentiments("nrc"))
east_coast_combined_bing<-east_coast_combined_tokens%>%
inner_join(get_sentiments("bing"))
Let’s take a look at some tables of the sentiments for each group:
# For West Coast
table(LA_sentiment_bing$sentiment)
##
## negative positive
## 124 109
table(Spokesman_sentiment_bing$sentiment)
##
## negative positive
## 39 27
table(Eurasia_sentiment_bing$sentiment)
##
## negative positive
## 508 368
table(West_Coast_Combined_sentiment_bing$sentiment)
##
## negative positive
## 575 413
# For Midwest
table(st_louis_dispatch_sentiment_bing$sentiment)
##
## negative positive
## 708 556
table(chicago_daily_herald_sentiment_bing$sentiment)
##
## negative positive
## 469 445
table(midwest_sentiment_bing$sentiment)
##
## negative positive
## 915 680
# For East Coast
table(nyt_bing$sentiment)
##
## negative positive
## 207 360
table(ajc_bing$sentiment)
##
## negative positive
## 196 128
table(ap_bing$sentiment)
##
## negative positive
## 320 188
table(east_coast_combined_bing$sentiment)
##
## negative positive
## 531 450
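To compare these splits at a glance, the Bing joins can be stacked and summarized. The sketch below assumes the *_bing objects created above and computes each source's share of positive terms; the source names in the list are just labels we pick here.
bing_list <- list(
  "LA Times" = LA_sentiment_bing,
  "Eurasia Review" = Eurasia_sentiment_bing,
  "Spokesman Review" = Spokesman_sentiment_bing,
  "Chicago Daily Herald" = chicago_daily_herald_sentiment_bing,
  "St. Louis Post-Dispatch" = st_louis_dispatch_sentiment_bing,
  "NYT" = nyt_bing, "AJC" = ajc_bing, "AP" = ap_bing
)
bind_rows(bing_list, .id = "source") %>%
  group_by(source, sentiment) %>%
  summarize(words = sum(n), .groups = "drop") %>%  # total occurrences (the tables above count distinct matched words)
  group_by(source) %>%
  mutate(share = words / sum(words)) %>%
  filter(sentiment == "positive") %>%
  arrange(desc(share))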
Shown below are the sentiment plots (AFINN score distributions) for each newspaper and region we chose:
## West Coast
ggplot(data = LA_sentiment_affin,
aes(x=value)
)+
geom_histogram()+
ggtitle("LA Times Sentiment Range")+
theme_minimal()

ggplot(data = Eurasia_sentiment_affin,
aes(x=value)
)+
geom_histogram()+
ggtitle("Eurasia Review Sentiment Range")+
theme_minimal()

ggplot(data = Spokesman_sentiment_affin,
aes(x=value)
)+
geom_histogram()+
ggtitle("Spokesman Review Sentiment Range")+
theme_minimal()

ggplot(data = West_Coast_Combined_sentiment_affin,
aes(x=value)
)+
geom_histogram()+
ggtitle("West Coast Sentiment Range")+
theme_minimal()

## Midwest
ggplot(data = chicago_daily_herald_sentiment_afinn,
aes(x=value)
)+
geom_histogram()+
ggtitle("Chicago Daily Herald Sentiment Range")+
theme_minimal()

ggplot(data = st_louis_dispatch_sentiment_afinn,
aes(x=value)
)+
geom_histogram()+
ggtitle("St. Louis Post-Dispatch (Missouri) Sentiment Range")+
theme_minimal()

ggplot(data = midwest_sentiment_afinn,
aes(x=value)
)+
geom_histogram()+
ggtitle("Midwest Sentiment Range")+
theme_minimal()

## East Coast
ggplot(data = nyt_afinn,
aes(x=value)
)+
geom_histogram()+
ggtitle("New York Times Sentiment Range")+
theme_minimal()

ggplot(data = ajc_afinn,
aes(x=value)
)+
geom_histogram()+
ggtitle("Atlanta Journal-Constitution Sentiment Range")+
theme_minimal()

ggplot(data = ap_afinn,
aes(x=value)
)+
geom_histogram()+
ggtitle("Spokesman Review Sentiment Range")+
theme_minimal()

ggplot(data = east_coast_combined_afinn,
aes(x=value)
)+
geom_histogram()+
ggtitle("East Coast Sentiment Range")+
theme_minimal()
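The histograms show the spread of AFINN scores; for a rough single-number comparison, one can also compute a frequency-weighted net score per source. A minimal sketch, assuming the *_affin/_afinn objects built above:
net_afinn <- function(df) sum(df$value * df$n)  # each word's AFINN score weighted by its count
net_afinn(West_Coast_Combined_sentiment_affin)
net_afinn(midwest_sentiment_afinn)
net_afinn(east_coast_combined_afinn)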

Shown below are the word clouds for each newspaper and region we chose:
## West Coast
set.seed(42)
ggplot(LA_Times[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

ggplot(Eurasia[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

ggplot(Spokesman[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

ggplot(West_Coast_Combined[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

## Midwest
set.seed(42)
ggplot(chicago_daily_herald_data[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

ggplot(st_louis_dispatch_data[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

ggplot(midwest_data[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

## East Coast
ggplot(nyt_tokens[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

ggplot(ajc_tokens[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

ggplot(ap_tokens[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

ggplot(east_coast_combined_tokens[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()

As you can see, the word cloud for each region contains the keywords “data” and “science”, which suggests that our combined analysis of the regions has some merit.
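As a quick check on that observation, we can filter each region's combined token counts for those keywords directly (using the count tibbles built above):
West_Coast_Combined %>% filter(word %in% c("data", "science"))
midwest_data %>% filter(word %in% c("data", "science"))
east_coast_combined_tokens %>% filter(word %in% c("data", "science"))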
Finally, we ran tf-idf across the newspapers within each region. We then ran tf-idf across the three regions, with each region's newspapers combined, to get a better understanding of the values.
First, here are the tf-idf results for the newspapers within each region.
# Read the values in properly
## West Coast
LA_Times_raw <- as_tibble(readLines("LA_Times.txt"))
Eurasia_raw <- as_tibble(readLines("Eurasia_Review.txt"))
Spokesman_raw <- as_tibble(readLines("Spokesman_Review.txt"))
West_Coast_Combined_raw <- as_tibble(readLines("West_Coast.txt"))
## Midwest
chicago_daily_herald_data_raw <- as_tibble(read_lines("Chicago_Daily_Herald_(100).txt"))
st_louis_dispatch_data_raw <- as_tibble(read_lines("St._Louis_Post-Dispatch_(Missouri)_(100).txt"))
midwest_combined_raw <-rbind(chicago_daily_herald_data_raw, st_louis_dispatch_data_raw)
## East Coast
nyt_raw<-as_tibble(read_lines("nyt.txt"))
ajc_raw<-as_tibble(read_lines("ajc.txt"))
ap_raw<-as_tibble(read_lines("ap.txt"))
east_coast_combined_raw<-rbind((rbind(nyt_raw["value"], ajc_raw["value"])), ap_raw["value"])
# A data-preparation helper to get the inputs into the right shape for bind_tf_idf():
# transpose the raw lines into columns, then collapse columns y through z into a single text string
data_prep <- function(x, y, z){
  i <- as_tibble(t(x))
  unite(i, "text", y:z, remove = TRUE, sep = "")
}
# Prep the data for comparison
## West Coast
LA_Times_bag <- data_prep(LA_Times_raw, 'V1','V479')
Eurasia_bag <- data_prep(Eurasia_raw, 'V1','V3849')
Spokesman_bag <- data_prep(Spokesman_raw, 'V1','V226')
West_Coast_Combined_bag <- data_prep(West_Coast_Combined_raw, 'V1','V4554')
## Midwest
chicago_daily_herald_data_bag <- data_prep(chicago_daily_herald_data_raw,'V1','V4306')
st_louis_dispatch_data_bag <- data_prep(st_louis_dispatch_data_raw,'V1','V6067')
midwest_combined_bag <- data_prep(midwest_combined_raw,'V1','V10373')
## East Coast
nyt_bag<-data_prep(nyt_raw, 'V1','V4956')
ajc_bag<-data_prep(ajc_raw, 'V1','V1022')
ap_bag<-data_prep(ap_raw, 'V1','V1593')
east_coast_combined_bag<-data_prep(east_coast_combined_raw, 'V1','V7571')
# For West Coast
Sources <- c("LA","Eurasia", "Spokesman")
tf_idf_text <- tibble(Sources,text=t(tibble(LA_Times_bag, Eurasia_bag, Spokesman_bag, .name_repair = "universal")))
class(tf_idf_text)
## [1] "tbl_df" "tbl" "data.frame"
word_count <- tf_idf_text %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(Sources, word, sort = TRUE)
total_words <- word_count %>%
group_by(Sources) %>%
summarize(total = sum(n))
Article_words <- left_join(word_count, total_words)
Article_words <- Article_words %>%
bind_tf_idf(word, Sources, n)
Article_words
## # A tibble: 12,524 x 7
## Sources word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Eurasia data 402 34747 0.0116 0 0
## 2 Eurasia ai 263 34747 0.00757 0.405 0.00307
## 3 Eurasia study 141 34747 0.00406 0.405 0.00165
## 4 Eurasia intelligence 138 34747 0.00397 0.405 0.00161
## 5 Eurasia science 134 34747 0.00386 0 0
## 6 Eurasia future 121 34747 0.00348 0.405 0.00141
## 7 Eurasia researchers 121 34747 0.00348 0.405 0.00141
## 8 Eurasia research 113 34747 0.00325 0 0
## 9 Eurasia 2018 108 34747 0.00311 0 0
## 10 Eurasia strategic 100 34747 0.00288 1.10 0.00316
## # … with 12,514 more rows
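For reference, bind_tf_idf() computes idf as the natural log of (number of documents / number of documents containing the word), which is why terms such as “data” and “science” that appear in all three outlets get an idf of 0:
log(3 / 3)  # appears in all 3 sources -> idf = 0
log(3 / 2)  # appears in 2 of 3 sources -> ~0.405
log(3 / 1)  # appears in only 1 source  -> ~1.10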
# For Midwest
midwest_newspapers <- c("Chicago Daily Herald (CDH)", "St. Louis Post-Dispatch (SLP-D)")
tf_idf_text <- tibble(midwest_newspapers,text=t(tibble(chicago_daily_herald_data_bag,st_louis_dispatch_data_bag,.name_repair = "universal")))
class(tf_idf_text)
## [1] "tbl_df" "tbl" "data.frame"
word_count <- tf_idf_text %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(midwest_newspapers, word, sort = TRUE)
total_words <- word_count %>%
group_by(midwest_newspapers) %>%
summarize(total = sum(n))
inag_words <- left_join(word_count, total_words)
inag_words <- inag_words %>%
bind_tf_idf(word, midwest_newspapers, n)
inag_words
## # A tibble: 22,753 x 7
## midwest_newspapers word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Chicago Daily Herald (CDH) science 826 53322 0.0155 0 0
## 2 Chicago Daily Herald (CDH) school 502 53322 0.00941 0 0
## 3 St. Louis Post-Dispatch (SLP-D) favorite 480 63203 0.00759 0 0
## 4 St. Louis Post-Dispatch (SLP-D) louis 479 63203 0.00758 0 0
## 5 Chicago Daily Herald (CDH) students 471 53322 0.00883 0 0
## 6 Chicago Daily Herald (CDH) data 467 53322 0.00876 0 0
## 7 St. Louis Post-Dispatch (SLP-D) st 456 63203 0.00721 0 0
## 8 Chicago Daily Herald (CDH) bachelor 428 53322 0.00803 0 0
## 9 St. Louis Post-Dispatch (SLP-D) academic 398 63203 0.00630 0 0
## 10 Chicago Daily Herald (CDH) education 366 53322 0.00686 0 0
## # … with 22,743 more rows
# For East Coast
sources<-c("NYT","AJC", "AP")
tf_idf_text<-tibble(sources,text=t(tibble(nyt_bag, ajc_bag, ap_bag, .name_repair = "universal")))
word_count<-tf_idf_text %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(sources, word, sort = TRUE)
total_words<-word_count %>%
group_by(sources) %>%
summarize(total = sum(n))
news_words<-left_join(word_count, total_words)
news_words<-news_words %>%
bind_tf_idf(word, sources, n)
news_words
## # A tibble: 16,960 x 7
## sources word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 NYT data 1661 50026 0.0332 0 0
## 2 NYT science 1156 50026 0.0231 0 0
## 3 NYT 2021 418 50026 0.00836 0 0
## 4 NYT business 326 50026 0.00652 0 0
## 5 NYT march 196 50026 0.00392 0 0
## 6 NYT analytics 183 50026 0.00366 0 0
## 7 NYT ai 180 50026 0.00360 0.405 0.00146
## 8 NYT students 173 50026 0.00346 0.405 0.00140
## 9 NYT price 169 50026 0.00338 0.405 0.00137
## 10 NYT learning 164 50026 0.00328 0 0
## # … with 16,950 more rows
And now, let’s compare the tf-idf values between the regions:
# Comparing the three regions (West Coast, Midwest, East Coast)
sources<-c("West Coast","Midwest", "East Coast")
tf_idf_text<-tibble(sources,text=t(tibble(West_Coast_Combined_bag, midwest_combined_bag, east_coast_combined_bag, .name_repair = "universal")))
word_count<-tf_idf_text %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(sources, word, sort = TRUE)
total_words<-word_count %>%
group_by(sources) %>%
summarize(total = sum(n))
news_words<-left_join(word_count, total_words)
news_words<-news_words %>%
bind_tf_idf(word, sources, n)
news_words
## # A tibble: 41,767 x 7
## sources word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 East Coast data 1900 73553 0.0258 0 0
## 2 East Coast science 1273 73553 0.0173 0 0
## 3 Midwest science 1188 116524 0.0102 0 0
## 4 Midwest school 763 116524 0.00655 0 0
## 5 Midwest data 730 116524 0.00626 0 0
## 6 Midwest students 646 116524 0.00554 0 0
## 7 Midwest college 613 116524 0.00526 0 0
## 8 Midwest favorite 482 116524 0.00414 0.405 0.00168
## 9 Midwest louis 480 116524 0.00412 0.405 0.00167
## 10 Midwest st 467 116524 0.00401 0 0
## # … with 41,757 more rows
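A natural follow-up (sketched below, not part of our original output) is to plot the highest tf-idf words per region from news_words; reorder_within() and scale_y_reordered() are helpers from tidytext.
news_words %>%
  group_by(sources) %>%
  slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, sources)) %>%  # order bars within each facet
  ggplot(aes(tf_idf, word, fill = sources)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~sources, scales = "free_y") +
  labs(x = "tf-idf", y = NULL, title = "Highest tf-idf words by region")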