## Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janeaustenr)
library(tidytext)
## Transform into tidytext format using toy data #########################
text <- c("Because I could not stop for Death -",
"He kindly stopped for me -",
"The Carriage held but just Ourselves -",
"and Immortality")
text
## [1] "Because I could not stop for Death -"
## [2] "He kindly stopped for me -"
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
## This is a typical character vector that we might want to analyze.
# In order to turn it into a tidy text dataset, we first need to put it into a data frame.
## Put the string data into a data frame
text_df <- tibble(line = 1:4, text = text)
text_df
## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 Because I could not stop for Death -
## 2 2 He kindly stopped for me -
## 3 3 The Carriage held but just Ourselves -
## 4 4 and Immortality
# We need to both break the text into individual tokens
# (a process called tokenization) and transform it into a tidy data structure.
# To do this, we use tidytext's unnest_tokens() function.
text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 x 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 i
## 3 1 could
## 4 1 not
## 5 1 stop
## 6 1 for
## 7 1 death
## 8 2 he
## 9 2 kindly
## 10 2 stopped
## 11 2 for
## 12 2 me
## 13 3 the
## 14 3 carriage
## 15 3 held
## 16 3 but
## 17 3 just
## 18 3 ourselves
## 19 4 and
## 20 4 immortality
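## Note that unnest_tokens() is not limited to single words; as a minimal
## sketch (not run on the disaster data below), the same toy data can be
## tokenized into bigrams instead:
text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)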
## Example 1: Kaggle disaster tweets #####################################
## Read data
disaster <- read.csv(file = 'C:/Users/ehk994/Desktop/Teaching/Text mining/Text mining I/disaster.csv', stringsAsFactors = F, header = T, na.strings = c("","NA"))
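## A quick sanity check of what was read in; this sketch only assumes the file
## has a "text" column, which is the column used throughout below.
dim(disaster)
head(disaster$text, 3)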
## Transform into tidytext format
tidy_disaster <- disaster %>%
  unnest_tokens(word, text)
## Use dplyr's count() to find the most common words across all tweets
tidy_count <- tidy_disaster %>%
  count(word, sort = TRUE)
## Create a visualization of the most common words
library(ggplot2)
tidy_disaster %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)

## Transform into tidytext format, removing stop words
tidy_disaster2 <- disaster %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"
## Use dplyr's count() to find the most common words across all tweets
tidy_count2 <- tidy_disaster2 %>%
  count(word, sort = TRUE)
## Create a visualization of the most common words
library(ggplot2)
tidy_disaster2 %>%
  count(word, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)

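## If tweet-specific tokens (e.g. "http", "t.co", "amp") still dominate the plot,
## one option (a sketch, not part of the pipeline above) is to extend stop_words
## with a custom lexicon before the anti_join():
custom_stop_words <- bind_rows(
  tibble(word = c("http", "https", "t.co", "amp"), lexicon = "custom"),
  stop_words)
disaster %>%
  unnest_tokens(word, text) %>%
  anti_join(custom_stop_words, by = "word")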
## Pre-processing with tm package ####################################
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Transform into corpus
disaster_corpus <- Corpus(VectorSource(as.vector(disaster$text)))
disaster_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7613
## to lower case
disaster_corpus <- tm_map(disaster_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(disaster_corpus, content_transformer(tolower)):
## transformation drops documents
## Remove punctuations
disaster_corpus <- tm_map(disaster_corpus, content_transformer(removePunctuation))
## Warning in tm_map.SimpleCorpus(disaster_corpus,
## content_transformer(removePunctuation)): transformation drops documents
## Remove whitespaces
disaster_corpus <- tm_map(disaster_corpus, content_transformer(stripWhitespace))
## Warning in tm_map.SimpleCorpus(disaster_corpus,
## content_transformer(stripWhitespace)): transformation drops documents
## Remove numbers
disaster_corpus <- tm_map(disaster_corpus, content_transformer(removeNumbers))
## Warning in tm_map.SimpleCorpus(disaster_corpus,
## content_transformer(removeNumbers)): transformation drops documents
## Remove stop words and include further words that need to be removed
disaster_corpus <- tm_map(disaster_corpus, removeWords, c(stopwords("english"), "t.co", "http", "https", "â", "ã"))
## Warning in tm_map.SimpleCorpus(disaster_corpus, removeWords,
## c(stopwords("english"), : transformation drops documents
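## Stemming is another common tm step; this is an optional sketch that assumes
## the SnowballC package is installed, and it writes to a new object so the
## pipeline above is left unchanged.
disaster_corpus_stemmed <- tm_map(disaster_corpus, stemDocument)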
## Back to data frame
disaster_df <- data.frame(text = get("content", disaster_corpus))  # cleaned text pulled out of the corpus
disaster_df2 <- tibble(line = 1:7613, text = disaster_df$text)  # 7613 tweets, as reported by the corpus above
## Transform into tidytext format after pre-processing
tidy_disaster_cleaned <- disaster_df2 %>%
  unnest_tokens(word, text)
## Use dplyr's count() to find the most common words across all tweets
tidy_disaster_cleaned_count <- tidy_disaster_cleaned %>%
  count(word, sort = TRUE)
## Create a visualization of the most common words
library(ggplot2)
tidy_disaster_cleaned %>%
  count(word, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)

## Sentiment analysis ###############################################
library(tidytext)
library(janeaustenr)
library(textdata)
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
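## The NRC lexicon assigns words to several emotions beyond positive/negative;
## a quick way to see the categories and how many words each contains:
get_sentiments("nrc") %>%
  count(sentiment, sort = TRUE)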
## Most common "fear" words in disaster tweets
nrc_fear <- get_sentiments("nrc") %>%
  filter(sentiment == "fear")
tidy_disaster_cleaned %>%
  inner_join(nrc_fear) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 513 x 2
## word n
## <chr> <int>
## 1 fire 250
## 2 emergency 157
## 3 disaster 152
## 4 police 140
## 5 crash 119
## 6 suicide 116
## 7 bomb 104
## 8 attack 99
## 9 war 90
## 10 accident 87
## # ... with 503 more rows
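## The same pattern works for any other NRC emotion; for example, "anger"
## (a sketch along the same lines as the "fear" count above):
nrc_anger <- get_sentiments("nrc") %>%
  filter(sentiment == "anger")
tidy_disaster_cleaned %>%
  inner_join(nrc_anger, by = "word") %>%
  count(word, sort = TRUE)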
## Net sentiment per tweet with "bing"
library(tidyr)
disaster_sentiment <- tidy_disaster_cleaned %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = line %/% 1, sentiment) %>%  # %/% 1 keeps one index per tweet; a larger divisor would pool tweets into blocks
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
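## A quick look at how net sentiment is distributed across tweets (a sketch):
ggplot(disaster_sentiment, aes(sentiment)) +
  geom_histogram(binwidth = 1)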
## Comparing the three sentiment dictionaries
afinn <- tidy_disaster_cleaned %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = line %/% 1) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
  tidy_disaster_cleaned %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tidy_disaster_cleaned %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive", "negative"))) %>%
    mutate(method = "NRC")) %>%
  count(method, index = line %/% 1, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
## Visualizations: AFINN vs BING vs NRC
bind_rows(afinn,
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

## Most common positive and negative words for the sentiment from BING
bing_word_counts <- tidy_disaster_cleaned %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

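## If a frequent word looks misleading in this context (for example "like",
## which Bing codes as positive but is often neutral in tweets; the word here
## is only illustrative), it can be filtered out before plotting:
misleading_words <- tibble(word = "like", lexicon = "custom")
bing_word_counts %>%
  anti_join(misleading_words, by = "word")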
## Wordclouds
library(wordcloud)
## Loading required package: RColorBrewer
tidy_disaster_cleaned %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): amp could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): storm could not be fit on page.
## It will not be plotted.

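## The "could not be fit on page" warnings come from the default font scaling;
## shrinking the largest size (a sketch using wordcloud's scale argument)
## usually lets those words fit:
tidy_disaster_cleaned %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100, scale = c(3, 0.5)))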
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_disaster_cleaned %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

## tf-idf #######################################################
library(dplyr)
library(janeaustenr)
library(tidytext)
## Count how often each word appears in each document (tweet)
disaster_count <- disaster_df2 %>%
  unnest_tokens(word, text) %>%
  count(line, word, sort = TRUE)
## Count total words for each document
total_words <- disaster_count %>%
  group_by(line) %>%
  summarize(total = sum(n))
disaster_count2 <- left_join(disaster_count, total_words)
## Joining, by = "line"
## Compute tf, idf, and tf-idf with the bind_tf_idf() function
disaster_tf_idf <- disaster_count2 %>%
  bind_tf_idf(word, line, n)
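## Words with the highest tf-idf are the most characteristic of an individual
## tweet; a quick way to inspect them (a sketch):
disaster_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  head(10)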