Data Collection and Processing

Load Packages

library(twitteR)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(stringr)
library(tidytext)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:twitteR':
## 
##     id, location
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(tidytext)

Collect Data via Twitter API

consumer_key <- "tCvRv70t5SH32HN5vFgk8UbZB"
consumer_secret <- "KqzIDAPdTyMjgaOFSB5FIT2lsT4OlISb6UqO1tKHsT6ebA7WZ2"
access_token <- "782926177170235392-5EHVOx4nosjoMPYj43hQ6kVirJOTXyv"
access_secret <- "NzpWlexZLoQWFgpwLczrAOOooufTZ5vZzpdTR2OYdV1Cs"

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## [1] "Using direct authentication"
tw = twitteR::searchTwitter('#irma', n = 1e4, since = '2017-08-08', retryOnRateLimit = 1e3)
data = twitteR::twListToDF(tw)

Descriptive data via a wordcloud

Visually speaking, a word cloud allows us to get a very first hint of what data text we got

data$text <- sapply(data$text,function(row) iconv(row, "latin1", "ASCII", sub=""))
corpus <- Corpus(VectorSource(data$text))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, stemDocument)

wordcloud(corpus, max.words = 200, random.order = FALSE, colors=brewer.pal(8, "Dark2"))

Data Cleaning

################################# CLEANNING #################################
tweet <- as.character(data$text)
#removing links
tweet = gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", " ", tweet)
#retweet
tweet = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)
# removing hashtags
tweet = gsub("#\\w+", " ", tweet)
# removing @people
tweet = gsub("@\\w+", " ", tweet)
#removing punctuations
tweet = gsub("[[:punct:]]", " ", tweet)
#removing numbers
tweet = gsub("[[:digit:]]", " ", tweet)
#removing emojis
tweet<-str_replace_all(tweet,"[^[:graph:]]"," ")
tweet <- str_replace_all(tweet,'https'," ")
tweet <- str_replace_all(tweet,'amp'," ")

tweet = gsub("[ \t]{2,}", " ", tweet)
tweet = gsub("^\\s+|\\s+$", "", tweet)
# lower case
tweet = tolower(tweet)

DF<-data.frame(data$screenName,tweet)

Data Frequency

corp <- Corpus(VectorSource(tweet))
corp <- tm_map(corp,removeWords,c(stopwords('english'),stopwords('SMART'),'required','responded'))
tdm <- TermDocumentMatrix(corp) 

freq.terms <- findFreqTerms(tdm,lowfreq=150)
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 150)
frequence <- data.frame(term = names(term.freq), freq = term.freq)
frequence1<-subset(frequence, freq > 250)
ggplot(frequence1, aes(x = term, y = freq)) + geom_bar(stat = "identity") + xlab("Terms") + ylab("Count") + coord_flip()

Sentiment Analysis

Split data into words

List <- strsplit(tweet," ")
data<-data.frame(id=rep(DF$data.screenName, sapply(List, length)), word=unlist(List)) 
#dataset structure
str(sentiments)
## Classes 'tbl_df', 'tbl' and 'data.frame':    23165 obs. of  4 variables:
##  $ word     : chr  "abacus" "abandon" "abandon" "abandon" ...
##  $ sentiment: chr  "trust" "fear" "negative" "sadness" ...
##  $ lexicon  : chr  "nrc" "nrc" "nrc" "nrc" ...
##  $ score    : int  NA NA NA NA NA NA NA NA NA NA ...
#no of entries for the different sources
sentiments %>% 
  group_by(lexicon) %>%
  summarise(noOfWords = n())
## # A tibble: 3 x 2
##   lexicon noOfWords
##     <chr>     <int>
## 1   AFINN      2476
## 2    bing      6788
## 3     nrc     13901
#NRC & BING lexicons use the `sentiment` feature
#no of words by sentiment
sentiments %>% 
  filter(lexicon %in% c("nrc", "bing")) %>%
  group_by(lexicon, sentiment) %>%
  summarise(noOfWords = n())
## Warning: package 'bindrcpp' was built under R version 3.3.2
## # A tibble: 12 x 3
## # Groups:   lexicon [?]
##    lexicon    sentiment noOfWords
##      <chr>        <chr>     <int>
##  1    bing     negative      4782
##  2    bing     positive      2006
##  3     nrc        anger      1247
##  4     nrc anticipation       839
##  5     nrc      disgust      1058
##  6     nrc         fear      1476
##  7     nrc          joy       689
##  8     nrc     negative      3324
##  9     nrc     positive      2312
## 10     nrc      sadness      1191
## 11     nrc     surprise       534
## 12     nrc        trust      1231
#AFINN lexicon uses the `score` feature
#no of words by score
sentiments %>% 
  filter(lexicon %in% c("AFINN")) %>%
  group_by(lexicon) %>%
  count(cut_width(score, 1))
## # A tibble: 11 x 3
## # Groups:   lexicon [1]
##    lexicon `cut_width(score, 1)`     n
##      <chr>                <fctr> <int>
##  1   AFINN           [-5.5,-4.5]    16
##  2   AFINN           (-4.5,-3.5]    43
##  3   AFINN           (-3.5,-2.5]   264
##  4   AFINN           (-2.5,-1.5]   965
##  5   AFINN           (-1.5,-0.5]   309
##  6   AFINN            (-0.5,0.5]     1
##  7   AFINN             (0.5,1.5]   208
##  8   AFINN             (1.5,2.5]   448
##  9   AFINN             (2.5,3.5]   172
## 10   AFINN             (3.5,4.5]    45
## 11   AFINN             (4.5,5.5]     5

AFINN

afinn_lexicon <- get_sentiments("afinn")
head(afinn_lexicon)
## # A tibble: 6 x 2
##         word score
##        <chr> <int>
## 1    abandon    -2
## 2  abandoned    -2
## 3   abandons    -2
## 4   abducted    -2
## 5  abduction    -2
## 6 abductions    -2

NRC

nrc_lexicon <- get_sentiments("nrc")
head(nrc_lexicon)
## # A tibble: 6 x 2
##        word sentiment
##       <chr>     <chr>
## 1    abacus     trust
## 2   abandon      fear
## 3   abandon  negative
## 4   abandon   sadness
## 5 abandoned     anger
## 6 abandoned      fear

BING

bing_lexicon <- get_sentiments("bing")
head(bing_lexicon)
## # A tibble: 6 x 2
##         word sentiment
##        <chr>     <chr>
## 1    2-faced  negative
## 2    2-faces  negative
## 3         a+  positive
## 4   abnormal  negative
## 5    abolish  negative
## 6 abominable  negative

Positive Sentiment

bing_positive <- get_sentiments("bing") %>%
  filter(sentiment == "positive")
positive<-data %>%
  semi_join(bing_positive) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_positive<-sum(positive$n)

positive_graph<-positive %>%
  filter(n > 20) 

ggplot(positive_graph,aes(positive_graph$word,positive_graph$n)) + geom_bar(stat="identity",fill="lightblue")+theme_bw()+coord_flip()

Negative Sentiment

bing_negative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
negative<-data %>%
  semi_join(bing_negative) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_negative<-sum(negative$n)


negative_graph<-negative %>%
  filter(n > 20) 
ggplot(negative_graph,aes(negative_graph$word,negative_graph$n)) + geom_bar(stat="identity",fill="lightblue")+theme_bw()+coord_flip()

JOY

#Using the nrc lexicon, only the words that are associated to a sentiment of `joy`
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

#Summarize the usage of `joy` words
joy<-data %>%
  semi_join(nrc_joy) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_joy<-sum(joy$n)

ANGER

nrc_anger <- get_sentiments("nrc") %>%
  filter(sentiment == "anger")
anger<-data %>%
  semi_join(nrc_anger) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_anger<-sum(anger$n)

ANTICIPATION

nrc_anticipation <- get_sentiments("nrc") %>%
  filter(sentiment == "anticipation")
anticipation<-data %>%
  semi_join(nrc_anticipation) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_anticipation<-sum(anticipation$n)

DISGUST

nrc_disgust <- get_sentiments("nrc") %>%
  filter(sentiment == "disgust")
disgust<-data %>%
  semi_join(nrc_disgust) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_disgust<-sum(disgust$n)

FEAR

nrc_fear <- get_sentiments("nrc") %>%
  filter(sentiment == "fear")
fear<-data %>%
  semi_join(nrc_fear) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_fear<-sum(fear$n)

SADNESS

nrc_sadness <- get_sentiments("nrc") %>%
  filter(sentiment == "sadness")
sadness<-data %>%
  semi_join(nrc_sadness) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_sadness<-sum(sadness$n)

SURPRISE

nrc_surprise <- get_sentiments("nrc") %>%
  filter(sentiment == "surprise")
surprise<-data %>%
  semi_join(nrc_surprise) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_surprise<-sum(surprise$n)

TRUST

nrc_trust <- get_sentiments("nrc") %>%
  filter(sentiment == "trust")
trust<-data %>%
  semi_join(nrc_trust) %>%
  count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_trust<-sum(trust$n)
lol <- data.frame(
  group = c("Joy", "Anger", "Anticipation","Disgust","Fear","Sadness","Surprise","Trust"),
  value = c(nb_joy<-sum(joy$n),
            nb_anger<-sum(anger$n),
            nb_anticipation<-sum(anticipation$n),
            nb_disgust<-sum(disgust$n),
            nb_fear<-sum(fear$n),
            nb_sadness<-sum(sadness$n),
            nb_surprise<-sum(surprise$n),
            nb_trust<-sum(trust$n))
)
head(lol)
##          group value
## 1          Joy  1785
## 2        Anger  2246
## 3 Anticipation  2155
## 4      Disgust  1003
## 5         Fear  3778
## 6      Sadness  2389
library(ggplot2)
# Barplot
bp<- ggplot(lol, aes(x="", y=value, fill=group))+
  geom_bar(width = 1, stat = "identity")
bp

library(plotrix)
## Warning: package 'plotrix' was built under R version 3.3.2
value<-c(nb_joy<-sum(joy$n),
         nb_anger<-sum(anger$n),
         nb_anticipation<-sum(anticipation$n),
         nb_disgust<-sum(disgust$n),
         nb_fear<-sum(fear$n),
         nb_sadness<-sum(sadness$n),
         nb_surprise<-sum(surprise$n),
         nb_trust<-sum(trust$n))
group<-c("Joy", "Anger", "Anticipation","Disgust","Fear","Sadness","Surprise","Trust")
pie3D(value,labels=group,explode=0.1,
      main="Pie Chart of main Sentiment ")

N-grams

df_tweets<-as.data.frame(DF[,c(2)])
colnames(df_tweets)<-c("text")
tweets_bigrams <- df_tweets %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

head(tweets_bigrams)
## # A tibble: 6 x 1
##       bigram
##        <chr>
## 1     time t
## 2   t people
## 3 people who
## 4   who left
## 5 left their
## 6 their dogs
tweets_bigrams2<-tweets_bigrams %>%
  count(bigram, sort = TRUE)

head(tweets_bigrams2)
## # A tibble: 6 x 2
##           bigram     n
##            <chr> <int>
## 1     people who   548
## 2 chained during   538
## 3   dogs chained   538
## 4    face felony   538
## 5 felony charges   538
## 6     left their   538
tweets_bigrams2<-subset(tweets_bigrams2, n>250)
ggplot(tweets_bigrams2,aes(tweets_bigrams2$bigram,tweets_bigrams2$n)) + geom_bar(stat="identity",fill="lightblue")+theme_bw()+coord_flip()

bigrams_separated <- tweets_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

head(bigram_counts)
## # A tibble: 6 x 3
##    word1   word2     n
##    <chr>   <chr> <int>
## 1   dogs chained   538
## 2 felony charges   538
## 3 mexico    city   271
## 4     au     sud   262
## 5     du mexique   262
## 6     en   jours   262
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

head(bigrams_united)
## # A tibble: 6 x 1
##              bigram
##               <chr>
## 1      dogs chained
## 2    felony charges
## 3      charges read
## 4 florida residents
## 5   flood insurance
## 6    mounting bills
bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 92 x 3
##    word1   word2     n
##    <chr>   <chr> <int>
##  1   not      to    51
##  2   not    many    40
##  3   not     the    12
##  4   not       a    10
##  5   not      in    10
##  6   not    jack     7
##  7   not inspite     5
##  8   not      sw     5
##  9   not      be     4
## 10   not   being     4
## # ... with 82 more rows
AFINN <- get_sentiments("afinn")

AFINN
## # A tibble: 2,476 x 2
##          word score
##         <chr> <int>
##  1    abandon    -2
##  2  abandoned    -2
##  3   abandons    -2
##  4   abducted    -2
##  5  abduction    -2
##  6 abductions    -2
##  7      abhor    -3
##  8   abhorred    -3
##  9  abhorrent    -3
## 10     abhors    -3
## # ... with 2,466 more rows
library(tidytext)

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()

head(not_words)
## # A tibble: 6 x 3
##       word2 score     n
##       <chr> <int> <int>
## 1    forget    -1     3
## 2   helping     2     2
## 3     allow     1     1
## 4     clear     1     1
## 5 forgotten    -1     1
## 6       god     1     1
library(ggplot2)
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()

negated_words
## # A tibble: 18 x 4
##    word1       word2 score     n
##    <chr>       <chr> <int> <int>
##  1   not      forget    -1     3
##  2    no      matter     1     2
##  3   not     helping     2     2
##  4 never        good     3     1
##  5    no        easy     1     1
##  6    no        help     2     1
##  7    no       mercy     2     1
##  8    no    shortage    -2     1
##  9    no significant     1     1
## 10    no      thanks     2     1
## 11    no       wrong    -2     1
## 12   not       allow     1     1
## 13   not       clear     1     1
## 14   not   forgotten    -1     1
## 15   not         god     1     1
## 16   not        good     3     1
## 17   not    increase     1     1
## 18   not      suffer    -2     1
not_words<-subset(negated_words, word1 == "not")
no_words<-subset(negated_words, word1 == "no")
never_words<-subset(negated_words, word1 == "never")
without_words<-subset(negated_words, word1 == "without")
no_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(100) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"no\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(100) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:tidyr':
## 
##     %>%, crossing
## The following objects are masked from 'package:dplyr':
## 
##     %>%, as_data_frame, groups, union
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# original counts
bigram_counts
## # A tibble: 15,619 x 3
##      word1   word2     n
##      <chr>   <chr> <int>
##  1    dogs chained   538
##  2  felony charges   538
##  3  mexico    city   271
##  4      au     sud   262
##  5      du mexique   262
##  6      en   jours   262
##  7   jours      au   262
##  8 mexique  mexico   262
##  9     sud      du   262
## 10  puerto    rico   207
## # ... with 15,609 more rows
bigram_graph <- bigram_counts %>%
  filter(n > 25) %>%
  graph_from_data_frame()

bigram_graph
## IGRAPH DN-- 264 238 -- 
## + attr: name (v/c), n (e/n)
## + edges (vertex names):
##  [1] dogs      ->chained felony    ->charges mexico    ->city   
##  [4] au        ->sud     du        ->mexique en        ->jours  
##  [7] jours     ->au      mexique   ->mexico  sud       ->du     
## [10] puerto    ->rico    karel     ->doorman ms        ->karel  
## [13] zr        ->ms      climate   ->change  du        ->cyclone
## [16] est       ->pas     lil       ->du      cyclone   ->stagne 
## [19] depuis    ->des     des       ->annes   dvast     ->paris  
## [22] guadeloupe->ni      mais      ->qui     ni        ->mais   
## + ... omitted several edges
library(ggraph)
## Warning: package 'ggraph' was built under R version 3.3.2
set.seed(2017)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()