## Load libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janeaustenr)
library(tidytext)
## Transform into tidy text format using toy data #########################
text <- c("Because I could not stop for Death -",
          "He kindly stopped for me -",
          "The Carriage held but just Ourselves -",
          "and Immortality")

text
## [1] "Because I could not stop for Death -"  
## [2] "He kindly stopped for me -"            
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
## This is a typical character vector that we might want to analyze. 
# In order to turn it into a tidy text dataset, we first need to put it into a data frame.

## Put the string data into a data frame
text_df <- tibble(line = 1:4, text = text)

text_df
## # A tibble: 4 x 2
##    line text                                  
##   <int> <chr>                                 
## 1     1 Because I could not stop for Death -  
## 2     2 He kindly stopped for me -            
## 3     3 The Carriage held but just Ourselves -
## 4     4 and Immortality
# We need to both break the text into individual tokens
# (a process called tokenization) and transform it to a tidy data structure.
# To do this, we use tidytext's unnest_tokens() function.
text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 x 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 for        
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality
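## unnest_tokens() is not limited to single words; as a quick sketch, the same
## toy data could also be split into bigrams via the token and n arguments:
text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)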
## Example 1: Kaggle disaster tweets #####################################
## Read data
disaster <- read.csv(file = 'C:/Users/ehk994/Desktop/Teaching/Text mining/Text mining I/disaster.csv', stringsAsFactors = F, header = T, na.strings = c("","NA"))

## Transform into tidy text format
tidy_disaster <- disaster %>%
  unnest_tokens(word, text)
## Use dplyr's count() to find the most common words across all tweets
tidy_count <- tidy_disaster %>%
  count(word, sort = TRUE)
## Create a visualization of the most common words
library(ggplot2)

tidy_disaster %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)

## Transform into tidy text format, removing stop words
tidy_disaster2 <- disaster %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"
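## stop_words is tidytext's built-in lexicon; extra tweet-specific tokens could
## be appended before the anti_join. A minimal sketch (the object names and the
## added words are illustrative, not derived from these data):
my_stop_words <- bind_rows(stop_words,
                           tibble(word = c("t.co", "http", "https"),
                                  lexicon = "custom"))

tidy_disaster_custom <- disaster %>%
  unnest_tokens(word, text) %>%
  anti_join(my_stop_words, by = "word")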
## Use dplyr's count() to find the most common words across all tweets
tidy_count2 <- tidy_disaster2 %>%
  count(word, sort = TRUE)
## Create a visualization of the most common words
library(ggplot2)

tidy_disaster2 %>%
  count(word, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)

## Pre-processing with tm package ####################################
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## Transform into corpus
disaster_corpus <- Corpus(VectorSource(as.vector(disaster$text))) 
disaster_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 7613
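## To spot-check the corpus as the cleaning proceeds, the first few documents
## can be printed with tm's inspect() (the indices here are arbitrary):
inspect(disaster_corpus[1:3])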
## Convert to lower case
disaster_corpus <- tm_map(disaster_corpus,  content_transformer(tolower)) 
## Warning in tm_map.SimpleCorpus(disaster_corpus, content_transformer(tolower)):
## transformation drops documents
## Remove punctuation
disaster_corpus <- tm_map(disaster_corpus, content_transformer(removePunctuation))
## Warning in tm_map.SimpleCorpus(disaster_corpus,
## content_transformer(removePunctuation)): transformation drops documents
## Collapse extra whitespace
disaster_corpus <- tm_map(disaster_corpus, content_transformer(stripWhitespace))
## Warning in tm_map.SimpleCorpus(disaster_corpus,
## content_transformer(stripWhitespace)): transformation drops documents
## Remove numbers
disaster_corpus <- tm_map(disaster_corpus, content_transformer(removeNumbers))
## Warning in tm_map.SimpleCorpus(disaster_corpus,
## content_transformer(removeNumbers)): transformation drops documents
## Remove English stop words plus additional tweet-specific tokens
disaster_corpus <- tm_map(disaster_corpus, removeWords, c(stopwords("english"), "t.co", "http", "https", "â", "ã"))
## Warning in tm_map.SimpleCorpus(disaster_corpus, removeWords,
## c(stopwords("english"), : transformation drops documents
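## An optional further step is stemming (reducing words to a common root) with
## tm's stemDocument, which requires the SnowballC package to be installed.
## A minimal sketch kept in a separate object so the pipeline above is unchanged:
disaster_corpus_stemmed <- tm_map(disaster_corpus, stemDocument)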
## Convert the cleaned corpus back to a data frame
disaster_df <- data.frame(text = get("content", disaster_corpus)) # cleaned text

disaster_df2 <- tibble(line = 1:nrow(disaster_df), text = disaster_df$text)
## Transform into tidy text format after pre-processing
tidy_disaster_cleaned <- disaster_df2 %>%
  unnest_tokens(word, text)
## Use dplyr's count() to find the most common words across all tweets
tidy_disaster_cleaned_count <- tidy_disaster_cleaned %>%
  count(word, sort = TRUE)
## Create a visualization of the most common words
library(ggplot2)

tidy_disaster_cleaned %>%
  count(word, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)

## Sentiment analysis ###############################################
library(tidytext)
library(janeaustenr)
library(textdata)
get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,891 more rows
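## The NRC lexicon assigns one word to several categories, so category sizes
## differ; a quick check of how many words each category contains:
get_sentiments("nrc") %>%
  count(sentiment, sort = TRUE)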
## Most common "fear" words in disaster tweets
nrc_fear <- get_sentiments("nrc") %>% 
  filter(sentiment == "fear")

tidy_disaster_cleaned %>%
  inner_join(nrc_fear) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 513 x 2
##    word          n
##    <chr>     <int>
##  1 fire        250
##  2 emergency   157
##  3 disaster    152
##  4 police      140
##  5 crash       119
##  6 suicide     116
##  7 bomb        104
##  8 attack       99
##  9 war          90
## 10 accident     87
## # ... with 503 more rows
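## The same inner_join pattern works for any NRC category; as a sketch, the
## number of tweet tokens falling into each category (raw counts, not adjusted
## for the differing category sizes noted above):
tidy_disaster_cleaned %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(sentiment, sort = TRUE)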
## Net sentiment per tweet with "bing"
library(tidyr)

disaster_sentiment <- tidy_disaster_cleaned %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = line, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
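## A quick look at how the per-tweet net sentiment is distributed; a minimal
## sketch (binwidth chosen arbitrarily):
ggplot(disaster_sentiment, aes(sentiment)) +
  geom_histogram(binwidth = 1) +
  labs(x = "Net sentiment (positive - negative)", y = "Number of tweets")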
## Comparing the three sentiment dictionaries
afinn <- tidy_disaster_cleaned %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = line) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
  tidy_disaster_cleaned %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tidy_disaster_cleaned %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = line, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
## Visualizations: AFINN vs BING vs NRC
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

## Most common positive and negative words according to the Bing lexicon
bing_word_counts <- tidy_disaster_cleaned %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
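
## If a frequent word is judged not to carry sentiment in this context, it can
## be dropped before the join. A minimal sketch with a purely illustrative word
## ("like" is an example, not a finding from these tweets):
custom_sentiment_stop <- tibble(word = "like", lexicon = "custom")

tidy_disaster_cleaned %>%
  anti_join(custom_sentiment_stop, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)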

## Wordclouds
library(wordcloud)
## Loading required package: RColorBrewer
tidy_disaster_cleaned %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): amp could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): storm could not be fit on page.
## It will not be plotted.

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_disaster_cleaned %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

## tf-idf #######################################################
library(dplyr)
library(janeaustenr)
library(tidytext)

## Count occurrences of each word within each tweet
disaster_count <- disaster_df2 %>%
  unnest_tokens(word, text) %>%
  count(line, word, sort = TRUE)

## Count total words for each document
total_words <- disaster_count %>% 
  group_by(line) %>% 
  summarize(total = sum(n))

disaster_count2 <- left_join(disaster_count, total_words)
## Joining, by = "line"
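## Before computing tf-idf, the distribution of term frequency (n / total)
## across tweets can be inspected; a minimal sketch (number of bins arbitrary):
ggplot(disaster_count2, aes(n / total)) +
  geom_histogram(bins = 30)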
## Compute tf-idf with the bind_tf_idf() function
disaster_tf_idf <- disaster_count2 %>%
  bind_tf_idf(word, line, n)
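
## The highest tf-idf words should be those most characteristic of a single
## tweet; a quick look in descending order:
disaster_tf_idf %>%
  arrange(desc(tf_idf))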