library(tidyverse)
library(tidytext)
library(ggwordcloud)
library(gutenbergr)
library(textdata)

1 Reading in Text for Different Regions

# Read in Text for Articles from Southern Region (Julia)
# Each line of the file becomes one row of a single-column tibble; the text is
# then tokenised into one word per row, stop words are removed, and the
# remaining words are tallied, so `south` ends up as a word/n table sorted by n.
# read_lines() already returns character data, so no extra coercion is needed.
south <- tibble(
  south = read_lines("~/Desktop/DS3001/DS 3001/07_text_mining/southern_region")
) %>%
  unnest_tokens(word, south) %>%
  # Name the join key explicitly rather than relying on dplyr guessing it.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

# Read in Text for Articles from Western Region (Jess)
# Each line of the file becomes one row of a single-column tibble; the text is
# then tokenised into one word per row, stop words are removed, and the
# remaining words are tallied, so `west` ends up as a word/n table sorted by n.
# read_lines() already returns character data, so no extra coercion is needed.
west <- tibble(
  west = read_lines("~/Desktop/DS3001/DS 3001/07_text_mining/west_coast.txt")
) %>%
  unnest_tokens(word, west) %>%
  # Name the join key explicitly rather than relying on dplyr guessing it.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

# Read in Text for Articles from Midwestern Region (Kara)
# Each line of the file becomes one row of a single-column tibble; the text is
# then tokenised into one word per row, stop words are removed, and the
# remaining words are tallied, so `midwest` ends up as a word/n table sorted
# by n. read_lines() already returns character data, so no coercion is needed.
midwest <- tibble(
  midwest = read_lines("~/Desktop/DS3001/DS 3001/07_text_mining/midwest_region")
) %>%
  unnest_tokens(word, midwest) %>%
  # Name the join key explicitly rather than relying on dplyr guessing it.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

2 Sentiments for Different Regions

# Sentiments for Southern Region Articles
# Each lexicon is matched to the Southern word counts on the shared `word`
# column; words that are not in a lexicon are dropped by the inner join.
south_sentiment_affin <- south %>%
  inner_join(get_sentiments("afinn"), by = "word")

south_sentiment_nrc <- south %>%
  inner_join(get_sentiments("nrc"), by = "word")

south_sentiment_bing <- south %>%
  inner_join(get_sentiments("bing"), by = "word")

# View() needs an interactive data viewer, so only open the tables in a live
# session; this keeps the script runnable through Rscript or when knitting.
if (interactive()) {
  View(south_sentiment_affin)
  View(south_sentiment_nrc)
  View(south_sentiment_bing)
}
# Negative sentiments are 179 and Positive sentiments are 105

# Sentiments for Western Region Articles
# Each lexicon is matched to the Western word counts on the shared `word`
# column; words that are not in a lexicon are dropped by the inner join.
west_sentiment_affin <- west %>%
  inner_join(get_sentiments("afinn"), by = "word")

west_sentiment_nrc <- west %>%
  inner_join(get_sentiments("nrc"), by = "word")

west_sentiment_bing <- west %>%
  inner_join(get_sentiments("bing"), by = "word")

# View() needs an interactive data viewer, so only open the tables in a live
# session; this keeps the script runnable through Rscript or when knitting.
if (interactive()) {
  View(west_sentiment_affin)
  View(west_sentiment_nrc)
  View(west_sentiment_bing)
}

# Negative sentiments are 161 and Positive sentiments are 81

# Sentiments for Midwestern Region Article
# Each lexicon is matched to the Midwestern word counts on the shared `word`
# column; words that are not in a lexicon are dropped by the inner join.
midwest_sentiment_affin <- midwest %>%
  inner_join(get_sentiments("afinn"), by = "word")

midwest_sentiment_nrc <- midwest %>%
  inner_join(get_sentiments("nrc"), by = "word")

midwest_sentiment_bing <- midwest %>%
  inner_join(get_sentiments("bing"), by = "word")

# View() needs an interactive data viewer, so only open the tables in a live
# session; this keeps the script runnable through Rscript or when knitting.
if (interactive()) {
  View(midwest_sentiment_affin)
  View(midwest_sentiment_nrc)
  View(midwest_sentiment_bing)
}

# Negative sentiments are 159 and Positive sentiments are 79

# Comparison of Regions
# Bing lexicon: number of matched words labelled negative vs. positive in each
# region. The `##` lines are the output recorded when this was last run.
table(south_sentiment_bing[["sentiment"]])
## 
## negative positive 
##      179      105
table(west_sentiment_bing[["sentiment"]])
## 
## negative positive 
##      161       81
table(midwest_sentiment_bing[["sentiment"]])
## 
## negative positive 
##      159       79
# NRC lexicon: number of matched words per sentiment category in each region.
table(south_sentiment_nrc[["sentiment"]])
## 
##        anger anticipation      disgust         fear          joy     negative 
##           74           94           43           89           62          170 
##     positive      sadness     surprise        trust 
##          231           63           48          137
table(west_sentiment_nrc[["sentiment"]])
## 
##        anger anticipation      disgust         fear          joy     negative 
##           59           69           37           82           44          137 
##     positive      sadness     surprise        trust 
##          187           58           33          116
table(midwest_sentiment_nrc[["sentiment"]])
## 
##        anger anticipation      disgust         fear          joy     negative 
##           68           84           30           88           53          142 
##     positive      sadness     surprise        trust 
##          189           70           41          108

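Conclusion: For the Southern Region, I saw there were 179 negative and 105 positive words under the bing lexicon, so negative language outweighed positive language in these articles.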

3 Plot of Sentiments for Different Regions

# Plot of Southern Region's Sentiment Range
# Histogram (20 bins) of the AFINN `value` scores matched to Southern words.
south_sentiment_affin %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20) +
  labs(title = "Southern Region Sentiment Range") +
  theme_minimal()

# Plot of Western Region's Sentiment Range
# Histogram (20 bins) of the AFINN `value` scores matched to Western words.
west_sentiment_affin %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20) +
  labs(title = "Western Region Sentiment Range") +
  theme_minimal()

# Plot of Midwestern Region's Sentiment Range
# Histogram (20 bins) of the AFINN `value` scores matched to Midwestern words.
midwest_sentiment_affin %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20) +
  labs(title = "Midwestern Region Sentiment Range") +
  theme_minimal()

4 Word Clouds for Different Regions

# Word Cloud for Southern Region
# Uses the first 50 rows of `south` (the most frequent words, because the
# table is sorted by count) and sizes each word by its count.
# The seed is fixed, presumably so the cloud layout is repeatable.
set.seed(42)
south[1:50, ] %>%
  ggplot(aes(label = word, size = n)) +
  geom_text_wordcloud() +
  theme_minimal()

# Word Cloud for Western Region
# Uses the first 50 rows of `west` (the most frequent words, because the
# table is sorted by count) and sizes each word by its count.
# The seed is fixed, presumably so the cloud layout is repeatable.
set.seed(42)
west[1:50, ] %>%
  ggplot(aes(label = word, size = n)) +
  geom_text_wordcloud() +
  theme_minimal()

# Word Cloud for Midwestern Region
# Uses the first 50 rows of `midwest` (the most frequent words, because the
# table is sorted by count) and sizes each word by its count.
# The seed is fixed, presumably so the cloud layout is repeatable.
set.seed(42)
midwest[1:50, ] %>%
  ggplot(aes(label = word, size = n)) +
  geom_text_wordcloud() +
  theme_minimal()

Conclusion: For the Southern Region, words like “climate”, “change”, “global”, “warming”, “energy”, “republicans”, “conservatives”, and “carbon” were among the most commonly occurring words in the articles from this region. For word sentiments in the Southern Region, there were 179 negative sentiments and 105 positive sentiments. This shows that there is somewhat more negative language regarding climate change. The positive sentiment count was a bit higher than I expected, however. The positive sentiment may be higher because some individuals in the Southern region do not see climate change as being as important an issue as people in other regions do. The Southern region has several Red states and, despite changing beliefs over the years, Republicans have historically denied the gravity of climate change. This is why I think there is less of a difference between negative and positive sentiment than in the Western region, for example. For many Republicans, climate change is viewed as a political issue. For this reason, I was not surprised to see the words “republicans” and “conservatives” in larger text on the word cloud. Many Republican politicians, such as President Donald Trump, have used concerning language about climate change, denying or downplaying its gravity. Some of these politicians have even blocked or rolled back policies to combat climate change. For the Southern region, which has a large Republican population, many individuals continue to see climate change as a political issue, which is supported by the text mining I performed for this region.

5 Term Frequency - Inverse Document Frequency for Regions

# Term Frequency - Inverse Document Frequency (tf-idf)

# Re-read each region's file untokenised, one line of text per row.
# as_tibble() replaces the deprecated as.tibble(); for a character vector both
# produce a one-column tibble.
south_raw <- as_tibble(
  read_lines("~/Desktop/DS3001/DS 3001/07_text_mining/southern_region")
)

west_raw <- as_tibble(
  read_lines("~/Desktop/DS3001/DS 3001/07_text_mining/west_coast.txt")
)

midwest_raw <- as_tibble(
  read_lines("~/Desktop/DS3001/DS 3001/07_text_mining/midwest_region")
)

# Collapse the rows of a one-column text table into a single string.
#
# x    A one-column table holding one line of text per row.
# y, z Names (or positions) of the first and last transposed columns to
#      merge. After transposing, the columns are named V1, V2, ..., Vn,
#      where n is the number of rows in `x`.
#
# Returns a one-row tibble whose `text` column holds the merged lines. Lines
# are joined with a single space so the last word of one line is not fused
# with the first word of the next when the text is tokenised later.
data_prep <- function(x, y, z) {
  wide <- t(x)
  # Name the columns explicitly instead of relying on tibble's deprecated
  # automatic V1..Vn naming for matrices without column names.
  if (is.null(colnames(wide))) {
    colnames(wide) <- paste0("V", seq_len(ncol(wide)))
  }
  wide <- tibble::as_tibble(wide)
  # all_of() marks y and z as external values rather than column names.
  tidyr::unite(
    wide,
    "text",
    tidyr::all_of(y):tidyr::all_of(z),
    remove = TRUE,
    sep = " "
  )
}

# Collapse each region's lines into one document string.
# Only the first 371 lines of the Southern file are used.
south_bag <- data_prep(south_raw[1:371, 1], "V1", "V371")

# For the other two regions every line is used; the last column name is
# derived from the row count so it tracks the file if its length changes.
west_bag <- data_prep(west_raw, "V1", paste0("V", nrow(west_raw)))

midwest_bag <- data_prep(midwest_raw, "V1", paste0("V", nrow(midwest_raw)))

region <- c("South", "West", "Midwest")

# One row per region, with the document held in a plain character column
# (rather than a matrix column produced by transposing a tibble).
tf_idf_text <- tibble(
  region,
  text = c(south_bag$text, west_bag$text, midwest_bag$text)
)

# View() needs an interactive data viewer, so only open it in a live session.
if (interactive()) {
  View(tf_idf_text)
}

# Tally how often each word occurs in each region's combined text.
# Stop words are not removed in this step.
word_count <- count(
  unnest_tokens(tf_idf_text, word, text),
  region, word,
  sort = TRUE
)

# Total number of word occurrences per region.
total_words <- summarize(group_by(word_count, region), total = sum(n))

# Attach the regional totals, then add the tf, idf and tf_idf columns.
region_words <- word_count %>%
  left_join(total_words) %>%
  bind_tf_idf(word, region, n)

# Print the result; the `##` lines are the output recorded from an earlier run.
region_words
## # A tibble: 7,288 × 7
##    region  word      n total     tf   idf tf_idf
##    <chr>   <chr> <int> <int>  <dbl> <dbl>  <dbl>
##  1 South   the     498  8857 0.0562     0      0
##  2 West    the     467  8091 0.0577     0      0
##  3 Midwest the     421  7749 0.0543     0      0
##  4 South   to      258  8857 0.0291     0      0
##  5 Midwest to      251  7749 0.0324     0      0
##  6 South   of      247  8857 0.0279     0      0
##  7 West    of      236  8091 0.0292     0      0
##  8 West    and     228  8091 0.0282     0      0
##  9 South   and     220  8857 0.0248     0      0
## 10 West    to      218  8091 0.0269     0      0
## # … with 7,278 more rows
# Keep the ten rows with the highest tf_idf in each region. Rows are sorted
# first, so the first ten positions per group are the top ten.
top10 <- region_words %>%
  arrange(desc(tf_idf)) %>%
  group_by(region) %>%
  slice(seq_len(10))

# One bar panel per region, each with its own word axis; bars are filled by
# tf_idf and the legend is hidden.
fig <- top10 %>%
  ggplot(aes(x = tf_idf, y = word, fill = tf_idf)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~region, scales = "free_y") +
  labs(x = "Regions and Most Impactful Words", y = NULL)
fig