For our lab we performed a sentiment analysis of newspaper articles retrieved with the keyword “Data Science”. We further filtered the data by U.S. geographic regions of our choosing and kept the top 100 results of all time, because coverage of data science has a different history in different regions (e.g., West Coast vs. Midwest).

The three regions we defined on the LexisNexis site were:

West Coast: Alaska or Arizona or California or Colorado or Hawaii or Idaho or Montana or Nevada or New Mexico or Oregon or Utah or Washington or Wyoming
Midwest: Illinois or Indiana or Iowa or Kansas or Michigan or Minnesota or Missouri or Nebraska or North Dakota or Ohio or Wisconsin
East Coast: Maine or New Hampshire or Massachusetts or Rhode Island or Connecticut or New York or New Jersey or Delaware or Maryland or Virginia or North Carolina or South Carolina or Georgia or Florida

We then chose newspapers for each region and filtered to the top 100 results in the database:

West Coast: Los Angeles Times, Eurasia Review, Spokesman Review
Midwest: Chicago Daily Herald, St. Louis Post-Dispatch (Missouri)
East Coast: New York Times, Atlanta Journal-Constitution, Associated Press
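The code below relies on a handful of packages: the tidyverse (readr, dplyr, tidyr, ggplot2, tibble), tidytext for tokenization and the sentiment lexicons, textdata (which supplies the AFINN and NRC lexicons on first use), and ggwordcloud for the word clouds. A minimal setup sketch, assuming these packages are installed:

library(tidyverse)    # read_lines(), dplyr verbs, tidyr::unite(), ggplot2, tibble
library(tidytext)     # unnest_tokens(), stop_words, get_sentiments(), bind_tf_idf()
library(textdata)     # downloads the AFINN and NRC lexicons when first requested
library(ggwordcloud)  # geom_text_wordcloud()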
# Read the files into the code

## West Coast
LA_Times<- readLines("LA_Times.txt")
Eurasia<- readLines("Eurasia_Review.txt")
Spokesman<- readLines("Spokesman_Review.txt")
West_Coast_Combined<- readLines("West_Coast.txt")

## Midwest
chicago_daily_herald_data <- read_lines("Chicago_Daily_Herald_(100).txt")
st_louis_dispatch_data <- read_lines("St._Louis_Post-Dispatch_(Missouri)_(100).txt")
midwest_data <- c(chicago_daily_herald_data, st_louis_dispatch_data) # Combined line vector for the entire region (c() concatenates the two files; paste() would glue lines together element-wise)

## East Coast
nyt<-read_lines("nyt.txt")
ajc<-read_lines("ajc.txt")
ap<-read_lines("ap.txt")
# Convert the files into tibbles that can be analyzed

## West Coast
LA_Times<- tibble(LA_Times)
Eurasia<- tibble(Eurasia)
Spokesman <- tibble(Spokesman)
West_Coast_Combined <- tibble(West_Coast_Combined)

## Midwest
chicago_daily_herald_data <- tibble(chicago_daily_herald_data)
st_louis_dispatch_data <- tibble(st_louis_dispatch_data)
midwest_data <- tibble(midwest_data)

## East Coast
nyt<-tibble(nyt)
ajc<-tibble(ajc)
ap<-tibble(ap)

c_nyt<-nyt%>%rename(value=nyt)
c_ajc<-ajc%>%rename(value=ajc)
c_ap<-ap%>%rename(value=ap)

east_coast_combined<-rbind((rbind(c_nyt["value"], c_ajc["value"])), c_ap["value"])
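The nested rbind() above works; an equivalent single call, as a small sketch using dplyr::bind_rows() on the renamed tibbles, would be:

east_coast_combined <- bind_rows(c_nyt["value"], c_ajc["value"], c_ap["value"])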
# Coerce the text columns to plain character vectors stripped of attributes
## West Coast
LA_Times$LA_Times <- as.character(LA_Times$LA_Times)
Eurasia$Eurasia <- as.character(Eurasia$Eurasia)
Spokesman$Spokesman <- as.character(Spokesman$Spokesman)
West_Coast_Combined$West_Coast_Combined <- as.character(West_Coast_Combined$West_Coast_Combined)

## Midwest
chicago_daily_herald_data$chicago_daily_herald_data <- as.character(chicago_daily_herald_data$chicago_daily_herald_data)
st_louis_dispatch_data$st_louis_dispatch_data <- as.character(st_louis_dispatch_data$st_louis_dispatch_data)
midwest_data$midwest_data <- as.character(midwest_data$midwest_data)

## East Coast
nyt$nyt<-as.character(nyt$nyt)
ajc$ajc<-as.character(ajc$ajc)
ap$ap<-as.character(ap$ap)
east_coast_combined$value<-as.character(east_coast_combined$value)
east_coast_combined<-east_coast_combined%>%rename(east_coast_combined=value)
# Tokenize the text into individual words, then anti_join() with stop_words to drop common stop words before counting

## West Coast
LA_Times <- LA_Times %>%
  unnest_tokens(word, LA_Times)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

Eurasia <- Eurasia %>%
  unnest_tokens(word, Eurasia)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

Spokesman <- Spokesman %>%
  unnest_tokens(word, Spokesman)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

West_Coast_Combined <- West_Coast_Combined %>%
  unnest_tokens(word, West_Coast_Combined)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

## Midwest
chicago_daily_herald_data <- chicago_daily_herald_data %>%
  unnest_tokens(word, chicago_daily_herald_data)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

st_louis_dispatch_data <- st_louis_dispatch_data %>%
  unnest_tokens(word, st_louis_dispatch_data)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

midwest_data <- midwest_data %>%
  unnest_tokens(word, midwest_data)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

## East Coast

nyt_tokens<-nyt%>%
  unnest_tokens(word, nyt)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

ajc_tokens<-ajc%>%
  unnest_tokens(word, ajc)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

ap_tokens<-ap%>%
  unnest_tokens(word, ap)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

east_coast_combined_tokens<-east_coast_combined%>%
  unnest_tokens(word, east_coast_combined)%>%
  anti_join(stop_words)%>% 
  count(word, sort=TRUE)

OK, now that we have our word frequencies, let's do some analysis. We will compare the newspapers and regions using sentiment analysis to see whether they generally align.

# For AFINN we get a list of words with a numeric sentiment score, 2,467 entries - not that many overall.
# NRC is considerably larger at 13,891 entries, but words can fall into several different emotion categories.
# Bing is also larger at 6,776 entries, but classifies words only as negative or positive.
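Those lexicon sizes can be checked directly; a quick sketch (the exact counts may differ slightly depending on the lexicon version textdata downloads):

nrow(get_sentiments("afinn"))
nrow(get_sentiments("nrc"))
nrow(get_sentiments("bing"))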

## West Coast

LA_sentiment_affin <- LA_Times %>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

LA_sentiment_nrc <- LA_Times %>%
  inner_join(get_sentiments("nrc"))

LA_sentiment_bing <- LA_Times %>%
  inner_join(get_sentiments("bing"))

Spokesman_sentiment_affin <- Spokesman %>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

Spokesman_sentiment_nrc <- Spokesman %>%
  inner_join(get_sentiments("nrc"))

Spokesman_sentiment_bing <- Spokesman %>%
  inner_join(get_sentiments("bing"))

Eurasia_sentiment_affin <- Eurasia %>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

Eurasia_sentiment_nrc <- Eurasia %>%
  inner_join(get_sentiments("nrc"))

Eurasia_sentiment_bing <- Eurasia %>%
  inner_join(get_sentiments("bing"))

West_Coast_Combined_sentiment_affin <- West_Coast_Combined %>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

West_Coast_Combined_sentiment_nrc <- West_Coast_Combined %>%
  inner_join(get_sentiments("nrc"))

West_Coast_Combined_sentiment_bing <- West_Coast_Combined %>%
  inner_join(get_sentiments("bing"))


## Midwest

chicago_daily_herald_sentiment_afinn <- chicago_daily_herald_data %>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

chicago_daily_herald_sentiment_nrc <- chicago_daily_herald_data %>%
  inner_join(get_sentiments("nrc"))

chicago_daily_herald_sentiment_bing <- chicago_daily_herald_data %>%
  inner_join(get_sentiments("bing"))

st_louis_dispatch_sentiment_afinn <- st_louis_dispatch_data %>%
  inner_join(get_sentiments("afinn"))
  
st_louis_dispatch_sentiment_nrc <- st_louis_dispatch_data %>%
  inner_join(get_sentiments("nrc"))

st_louis_dispatch_sentiment_bing <- st_louis_dispatch_data %>%
  inner_join(get_sentiments("bing"))

midwest_sentiment_afinn <- midwest_data %>%
  inner_join(get_sentiments("afinn"))
  
midwest_sentiment_nrc <- midwest_data %>%
  inner_join(get_sentiments("nrc"))

midwest_sentiment_bing <- midwest_data %>%
  inner_join(get_sentiments("bing"))


## East Coast

nyt_afinn<-nyt_tokens%>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

nyt_nrc<-nyt_tokens%>%
  inner_join(get_sentiments("nrc"))

nyt_bing<-nyt_tokens%>%
  inner_join(get_sentiments("bing"))

ajc_afinn<-ajc_tokens%>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

ajc_nrc<-ajc_tokens%>%
  inner_join(get_sentiments("nrc"))

ajc_bing<-ajc_tokens%>%
  inner_join(get_sentiments("bing"))

ap_afinn<-ap_tokens%>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

ap_nrc<-ap_tokens%>%
  inner_join(get_sentiments("nrc"))

ap_bing<-ap_tokens%>%
  inner_join(get_sentiments("bing"))


east_coast_combined_afinn<-east_coast_combined_tokens%>%
  inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable

east_coast_combined_nrc<-east_coast_combined_tokens%>%
  inner_join(get_sentiments("nrc"))

east_coast_combined_bing<-east_coast_combined_tokens%>%
  inner_join(get_sentiments("bing"))

Let’s take a look at some tables of the sentiments for each group:

# For West Coast
table(LA_sentiment_bing$sentiment)
## 
## negative positive 
##      124      109
table(Spokesman_sentiment_bing$sentiment)
## 
## negative positive 
##       39       27
table(Eurasia_sentiment_bing$sentiment)
## 
## negative positive 
##      508      368
table(West_Coast_Combined_sentiment_bing$sentiment)
## 
## negative positive 
##      575      413
# For Midwest
table(st_louis_dispatch_sentiment_bing$sentiment)
## 
## negative positive 
##      708      556
table(chicago_daily_herald_sentiment_bing$sentiment)
## 
## negative positive 
##      469      445
table(midwest_sentiment_bing$sentiment)
## 
## negative positive 
##      915      680
# For East Coast
table(nyt_bing$sentiment)
## 
## negative positive 
##      207      360
table(ajc_bing$sentiment)
## 
## negative positive 
##      196      128
table(ap_bing$sentiment)
## 
## negative positive 
##      320      188
table(east_coast_combined_bing$sentiment)
## 
## negative positive 
##      531      450
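Because the totals differ between sources, the raw counts are hard to compare directly. One optional summary, a sketch that is not part of the original output, is the share of matched words that are negative versus positive in each combined region:

round(prop.table(table(West_Coast_Combined_sentiment_bing$sentiment)), 2)
round(prop.table(table(midwest_sentiment_bing$sentiment)), 2)
round(prop.table(table(east_coast_combined_bing$sentiment)), 2)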

Shown below are the AFINN sentiment histograms for each newspaper and for each combined region we chose.

## West Coast
ggplot(data = LA_sentiment_affin, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("LA Times Sentiment Range")+
  theme_minimal()

ggplot(data = Eurasia_sentiment_affin, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("Eurasia Review Sentiment Range")+
  theme_minimal()

ggplot(data = Spokesman_sentiment_affin, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("Spokesman Review Sentiment Range")+
  theme_minimal()

ggplot(data = West_Coast_Combined_sentiment_affin, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("West Coast Sentiment Range")+
  theme_minimal()

## Midwest
ggplot(data = chicago_daily_herald_sentiment_afinn, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("Chicago Daily Herald Sentiment Range")+
  theme_minimal()

ggplot(data = st_louis_dispatch_sentiment_afinn, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("St. Louis Post-Dispatch (Missouri) Sentiment Range")+
  theme_minimal()

ggplot(data = midwest_sentiment_afinn, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("Midwest Sentiment Range")+
  theme_minimal()

## East Coast
ggplot(data = nyt_afinn, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("New York Times Sentiment Range")+
  theme_minimal()

ggplot(data = ajc_afinn, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("Atlanta Journal-Constitution Sentiment Range")+
  theme_minimal()

ggplot(data = ap_afinn, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("Spokesman Review Sentiment Range")+
  theme_minimal()

ggplot(data = east_coast_combined_afinn, 
       aes(x=value)
        )+
  geom_histogram()+
  ggtitle("East Coast Sentiment Range")+
  theme_minimal()
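The histograms above show how the distinct AFINN-scored words are distributed for each source. As an optional numeric companion, a sketch not in the original analysis, a frequency-weighted mean AFINN score can be computed from the same objects, since each sentiment tibble carries both the AFINN value and the word count n:

# Hypothetical helper: average AFINN value weighted by how often each word appears
weighted_afinn <- function(df) sum(df$value * df$n) / sum(df$n)
weighted_afinn(West_Coast_Combined_sentiment_affin)
weighted_afinn(midwest_sentiment_afinn)
weighted_afinn(east_coast_combined_afinn)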

Shown below are the word clouds (top 50 words by frequency) for each newspaper and for each combined region.

## West Coast

set.seed(42)
ggplot(LA_Times[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

ggplot(Eurasia[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

ggplot(Spokesman[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

ggplot(West_Coast_Combined[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

## Midwest

set.seed(42)
ggplot(chicago_daily_herald_data[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

ggplot(st_louis_dispatch_data[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

ggplot(midwest_data[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

## East Coast
ggplot(nyt_tokens[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

ggplot(ajc_tokens[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

ggplot(ap_tokens[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

ggplot(east_coast_combined_tokens[1:50,], aes(label = word, size = n)
       ) +
  geom_text_wordcloud() +
  theme_minimal()

As you can see, each region's word cloud prominently features the keywords “data” and “science”, which suggests that combining the newspapers by region still captures the topic we searched for.

Finally, we computed tf-idf for the newspapers within each region. We then computed tf-idf across the regions, with the newspapers combined by region, to get a better sense of which terms distinguish each region.
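As a reminder of what bind_tf_idf() reports: tf is a term's share of all the words in one source, and idf = ln(number of sources / number of sources containing the term). For example, in the three-newspaper comparisons a term appearing in all three sources gets idf = 0, in two sources ln(3/2) ≈ 0.405, and in a single source ln(3) ≈ 1.10, which is why ubiquitous terms like “data” and “science” fall to a tf-idf of 0 in the tables below. A quick check of those constants:

log(3 / 3)  # term in all three sources   -> idf = 0
log(3 / 2)  # term in two of three sources -> idf ~ 0.405
log(3 / 1)  # term in only one source      -> idf ~ 1.099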

First, here are the tf-idf results for the newspapers within each region.

# Re-read the raw text files, since the objects above were already tokenized into word counts

## West Coast
LA_Times_raw <- as_tibble(readLines("LA_Times.txt"))
Eurasia_raw <- as_tibble(readLines("Eurasia_Review.txt"))
Spokesman_raw <- as_tibble(readLines("Spokesman_Review.txt"))

West_Coast_Combined_raw <- as_tibble(readLines("West_Coast.txt"))

## Midwest
chicago_daily_herald_data_raw <- as_tibble(read_lines("Chicago_Daily_Herald_(100).txt"))
st_louis_dispatch_data_raw <- as_tibble(read_lines("St._Louis_Post-Dispatch_(Missouri)_(100).txt"))

midwest_combined_raw <-rbind(chicago_daily_herald_data_raw, st_louis_dispatch_data_raw)

## East Coast
nyt_raw<-as_tibble(read_lines("nyt.txt"))
ajc_raw<-as_tibble(read_lines("ajc.txt"))
ap_raw<-as_tibble(read_lines("ap.txt"))

east_coast_combined_raw<-rbind((rbind(nyt_raw["value"], ajc_raw["value"])), ap_raw["value"])


# Helper that reshapes a file's lines into the single text column expected by the tf-idf step
data_prep <- function(x, y, z){
  i <- as_tibble(t(x))                            # transpose: one row, with one column per line of text
  unite(i, "text", y:z, remove = TRUE, sep = "")  # collapse columns y through z into one "text" string
}
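The data_prep() calls below spell out the first and last transposed column names ('V1' through, e.g., 'V479'), which must match the number of lines in each file. A hedged alternative, an assumption on my part rather than the original approach, is to let tidyselect grab every column so the endpoints never need to be counted by hand:

# Hypothetical variant: unite all columns, avoiding manual 'V...' endpoints
data_prep_all <- function(x){
  unite(as_tibble(t(x)), "text", everything(), remove = TRUE, sep = "")
}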

# Prep the data for comparison
## West Coast
LA_Times_bag <- data_prep(LA_Times_raw, 'V1','V479')
Eurasia_bag <- data_prep(Eurasia_raw, 'V1','V3849')
Spokesman_bag <- data_prep(Spokesman_raw, 'V1','V226')
West_Coast_Combined_bag <- data_prep(West_Coast_Combined_raw, 'V1','V4554')

## Midwest
chicago_daily_herald_data_bag <- data_prep(chicago_daily_herald_data_raw,'V1','V4306')
st_louis_dispatch_data_bag <- data_prep(st_louis_dispatch_data_raw,'V1','V6067')
midwest_combined_bag <- data_prep(midwest_combined_raw,'V1','V10373')

## East Coast
nyt_bag<-data_prep(nyt_raw, 'V1','V4956')
ajc_bag<-data_prep(ajc_raw, 'V1','V1022')
ap_bag<-data_prep(ap_raw, 'V1','V1593')
east_coast_combined_bag<-data_prep(east_coast_combined_raw, 'V1','V7571')
# For West Coast
Sources <- c("LA","Eurasia", "Spokesman")

tf_idf_text <- tibble(Sources,text=t(tibble(LA_Times_bag, Eurasia_bag, Spokesman_bag, .name_repair = "universal")))

class(tf_idf_text)
## [1] "tbl_df"     "tbl"        "data.frame"
word_count <- tf_idf_text %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(Sources, word, sort = TRUE)

total_words <- word_count %>% 
  group_by(Sources) %>% 
  summarize(total = sum(n))

Article_words <- left_join(word_count, total_words)

Article_words <- Article_words %>%
  bind_tf_idf(word, Sources, n)

Article_words
## # A tibble: 12,524 x 7
##    Sources word             n total      tf   idf  tf_idf
##    <chr>   <chr>        <int> <int>   <dbl> <dbl>   <dbl>
##  1 Eurasia data           402 34747 0.0116  0     0      
##  2 Eurasia ai             263 34747 0.00757 0.405 0.00307
##  3 Eurasia study          141 34747 0.00406 0.405 0.00165
##  4 Eurasia intelligence   138 34747 0.00397 0.405 0.00161
##  5 Eurasia science        134 34747 0.00386 0     0      
##  6 Eurasia future         121 34747 0.00348 0.405 0.00141
##  7 Eurasia researchers    121 34747 0.00348 0.405 0.00141
##  8 Eurasia research       113 34747 0.00325 0     0      
##  9 Eurasia 2018           108 34747 0.00311 0     0      
## 10 Eurasia strategic      100 34747 0.00288 1.10  0.00316
## # … with 12,514 more rows
# For Midwest
midwest_newspapers <- c("Chicago Daily Herald (CDH)", "St. Louis Post-Dispatch (SLP-D)")

tf_idf_text <- tibble(midwest_newspapers,text=t(tibble(chicago_daily_herald_data_bag,st_louis_dispatch_data_bag,.name_repair = "universal")))

class(tf_idf_text)
## [1] "tbl_df"     "tbl"        "data.frame"
word_count <- tf_idf_text %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(midwest_newspapers, word, sort = TRUE)


total_words <- word_count %>% 
  group_by(midwest_newspapers) %>% 
  summarize(total = sum(n))

midwest_words <- left_join(word_count, total_words)

midwest_words <- midwest_words %>%
  bind_tf_idf(word, midwest_newspapers, n)
midwest_words
## # A tibble: 22,753 x 7
##    midwest_newspapers              word          n total      tf   idf tf_idf
##    <chr>                           <chr>     <int> <int>   <dbl> <dbl>  <dbl>
##  1 Chicago Daily Herald (CDH)      science     826 53322 0.0155      0      0
##  2 Chicago Daily Herald (CDH)      school      502 53322 0.00941     0      0
##  3 St. Louis Post-Dispatch (SLP-D) favorite    480 63203 0.00759     0      0
##  4 St. Louis Post-Dispatch (SLP-D) louis       479 63203 0.00758     0      0
##  5 Chicago Daily Herald (CDH)      students    471 53322 0.00883     0      0
##  6 Chicago Daily Herald (CDH)      data        467 53322 0.00876     0      0
##  7 St. Louis Post-Dispatch (SLP-D) st          456 63203 0.00721     0      0
##  8 Chicago Daily Herald (CDH)      bachelor    428 53322 0.00803     0      0
##  9 St. Louis Post-Dispatch (SLP-D) academic    398 63203 0.00630     0      0
## 10 Chicago Daily Herald (CDH)      education   366 53322 0.00686     0      0
## # … with 22,743 more rows
# For East Coast
sources<-c("NYT","AJC", "AP")
tf_idf_text<-tibble(sources,text=t(tibble(nyt_bag, ajc_bag, ap_bag, .name_repair = "universal")))

word_count<-tf_idf_text %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(sources, word, sort = TRUE)

total_words<-word_count %>% 
  group_by(sources) %>% 
  summarize(total = sum(n))

news_words<-left_join(word_count, total_words)

news_words<-news_words %>%
  bind_tf_idf(word, sources, n)

news_words
## # A tibble: 16,960 x 7
##    sources word          n total      tf   idf  tf_idf
##    <chr>   <chr>     <int> <int>   <dbl> <dbl>   <dbl>
##  1 NYT     data       1661 50026 0.0332  0     0      
##  2 NYT     science    1156 50026 0.0231  0     0      
##  3 NYT     2021        418 50026 0.00836 0     0      
##  4 NYT     business    326 50026 0.00652 0     0      
##  5 NYT     march       196 50026 0.00392 0     0      
##  6 NYT     analytics   183 50026 0.00366 0     0      
##  7 NYT     ai          180 50026 0.00360 0.405 0.00146
##  8 NYT     students    173 50026 0.00346 0.405 0.00140
##  9 NYT     price       169 50026 0.00338 0.405 0.00137
## 10 NYT     learning    164 50026 0.00328 0     0      
## # … with 16,950 more rows

And now, let’s compare the tf-idfs between regions

# Compare the combined regions
sources<-c("West Coast","Midwest", "East Coast")
tf_idf_text<-tibble(sources,text=t(tibble(West_Coast_Combined_bag, midwest_combined_bag, east_coast_combined_bag, .name_repair = "universal")))

word_count<-tf_idf_text %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(sources, word, sort = TRUE)

total_words<-word_count %>% 
  group_by(sources) %>% 
  summarize(total = sum(n))

news_words<-left_join(word_count, total_words)

news_words<-news_words %>%
  bind_tf_idf(word, sources, n)

news_words
## # A tibble: 41,767 x 7
##    sources    word         n  total      tf   idf  tf_idf
##    <chr>      <chr>    <int>  <int>   <dbl> <dbl>   <dbl>
##  1 East Coast data      1900  73553 0.0258  0     0      
##  2 East Coast science   1273  73553 0.0173  0     0      
##  3 Midwest    science   1188 116524 0.0102  0     0      
##  4 Midwest    school     763 116524 0.00655 0     0      
##  5 Midwest    data       730 116524 0.00626 0     0      
##  6 Midwest    students   646 116524 0.00554 0     0      
##  7 Midwest    college    613 116524 0.00526 0     0      
##  8 Midwest    favorite   482 116524 0.00414 0.405 0.00168
##  9 Midwest    louis      480 116524 0.00412 0.405 0.00167
## 10 Midwest    st         467 116524 0.00401 0     0      
## # … with 41,757 more rows