library(twitteR)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(stringr)
library(tidytext)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:twitteR':
##
## id, location
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# credentials redacted; substitute your own Twitter API keys
consumer_key <- "YOUR_CONSUMER_KEY"
consumer_secret <- "YOUR_CONSUMER_SECRET"
access_token <- "YOUR_ACCESS_TOKEN"
access_secret <- "YOUR_ACCESS_SECRET"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## [1] "Using direct authentication"
tw = twitteR::searchTwitter('#irma', n = 1e4, since = '2017-08-08', retryOnRateLimit = 1e3)
data = twitteR::twListToDF(tw)
Visually speaking, a word cloud gives us a first hint of the text data we collected.
# drop non-ASCII characters so downstream tm functions don't choke on them
data$text <- sapply(data$text, function(row) iconv(row, "latin1", "ASCII", sub = ""))
corpus <- Corpus(VectorSource(data$text))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, content_transformer(tolower)) # lowercase so stopword removal also catches "The", "And", ...
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, stemDocument)
wordcloud(corpus, max.words = 200, random.order = FALSE, colors=brewer.pal(8, "Dark2"))
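The cloud can also be drawn from an explicit term-frequency table rather than from the corpus itself, which makes the counts inspectable and the layout reproducible. A minimal sketch, reusing the `corpus` built above (the seed value is arbitrary):
# A sketch: build the cloud from explicit term frequencies instead of the corpus.
tdm0 <- TermDocumentMatrix(corpus)
freqs <- sort(rowSums(as.matrix(tdm0)), decreasing = TRUE)
set.seed(1234) # arbitrary seed, makes the layout reproducible
wordcloud(names(freqs), freqs, max.words = 200,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))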
################################# CLEANING #################################
tweet <- as.character(data$text)
#removing links
tweet = gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", " ", tweet)
#retweet
tweet = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)
# removing hashtags
tweet = gsub("#\\w+", " ", tweet)
# removing @people
tweet = gsub("@\\w+", " ", tweet)
#removing punctuations
tweet = gsub("[[:punct:]]", " ", tweet)
#removing numbers
tweet = gsub("[[:digit:]]", " ", tweet)
#removing emojis and other non-printable characters
tweet <- str_replace_all(tweet, "[^[:graph:]]", " ")
#removing leftover 'https' fragments and HTML '&amp;' residue
tweet <- str_replace_all(tweet, 'https', " ")
tweet <- str_replace_all(tweet, 'amp', " ")
#collapsing repeated whitespace and trimming
tweet = gsub("[ \t]{2,}", " ", tweet)
tweet = gsub("^\\s+|\\s+$", "", tweet)
# lower case
tweet = tolower(tweet)
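For reuse, the same substitutions can be wrapped in a single helper; a minimal sketch (the function name `clean_tweets` is ours, not from any package):
# A sketch wrapping the cleaning pipeline above into one reusable function.
clean_tweets <- function(x) {
  x <- gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", " ", x) # links
  x <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", x)      # retweet markers
  x <- gsub("#\\w+", " ", x)                            # hashtags
  x <- gsub("@\\w+", " ", x)                            # @mentions
  x <- gsub("[[:punct:]]", " ", x)                      # punctuation
  x <- gsub("[[:digit:]]", " ", x)                      # numbers
  x <- gsub("[^[:graph:]]", " ", x)                     # emojis / non-printables
  x <- gsub("https", " ", x, fixed = TRUE)              # leftover protocol fragments
  x <- gsub("amp", " ", x, fixed = TRUE)                # HTML &amp; residue
  x <- gsub("[ \t]{2,}", " ", x)                        # collapse whitespace
  x <- gsub("^\\s+|\\s+$", "", x)                       # trim
  tolower(x)
}
# equivalent to the pipeline above: tweet <- clean_tweets(as.character(data$text))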
# strings become factors by default here (R < 3.6), hence the coercion warnings during the joins below
DF <- data.frame(data$screenName, tweet)
corp <- Corpus(VectorSource(tweet))
corp <- tm_map(corp,removeWords,c(stopwords('english'),stopwords('SMART'),'required','responded'))
tdm <- TermDocumentMatrix(corp)
freq.terms <- findFreqTerms(tdm,lowfreq=150)
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 150)
frequence <- data.frame(term = names(term.freq), freq = term.freq)
frequence1<-subset(frequence, freq > 250)
ggplot(frequence1, aes(x = term, y = freq)) + geom_bar(stat = "identity") + xlab("Terms") + ylab("Count") + coord_flip()
List <- strsplit(tweet, " ")
# one row per word; note that this overwrites the original `data` data frame
data <- data.frame(id = rep(DF$data.screenName, sapply(List, length)), word = unlist(List))
#dataset structure
str(sentiments)
## Classes 'tbl_df', 'tbl' and 'data.frame': 23165 obs. of 4 variables:
## $ word : chr "abacus" "abandon" "abandon" "abandon" ...
## $ sentiment: chr "trust" "fear" "negative" "sadness" ...
## $ lexicon : chr "nrc" "nrc" "nrc" "nrc" ...
## $ score : int NA NA NA NA NA NA NA NA NA NA ...
#no of entries for the different sources
sentiments %>%
group_by(lexicon) %>%
summarise(noOfWords = n())
## # A tibble: 3 x 2
## lexicon noOfWords
## <chr> <int>
## 1 AFINN 2476
## 2 bing 6788
## 3 nrc 13901
#NRC & BING lexicons use the `sentiment` feature
#no of words by sentiment
sentiments %>%
filter(lexicon %in% c("nrc", "bing")) %>%
group_by(lexicon, sentiment) %>%
summarise(noOfWords = n())
## Warning: package 'bindrcpp' was built under R version 3.3.2
## # A tibble: 12 x 3
## # Groups: lexicon [?]
## lexicon sentiment noOfWords
## <chr> <chr> <int>
## 1 bing negative 4782
## 2 bing positive 2006
## 3 nrc anger 1247
## 4 nrc anticipation 839
## 5 nrc disgust 1058
## 6 nrc fear 1476
## 7 nrc joy 689
## 8 nrc negative 3324
## 9 nrc positive 2312
## 10 nrc sadness 1191
## 11 nrc surprise 534
## 12 nrc trust 1231
#AFINN lexicon uses the `score` feature
#no of words by score
sentiments %>%
filter(lexicon %in% c("AFINN")) %>%
group_by(lexicon) %>%
count(cut_width(score, 1))
## # A tibble: 11 x 3
## # Groups: lexicon [1]
## lexicon `cut_width(score, 1)` n
## <chr> <fctr> <int>
## 1 AFINN [-5.5,-4.5] 16
## 2 AFINN (-4.5,-3.5] 43
## 3 AFINN (-3.5,-2.5] 264
## 4 AFINN (-2.5,-1.5] 965
## 5 AFINN (-1.5,-0.5] 309
## 6 AFINN (-0.5,0.5] 1
## 7 AFINN (0.5,1.5] 208
## 8 AFINN (1.5,2.5] 448
## 9 AFINN (2.5,3.5] 172
## 10 AFINN (3.5,4.5] 45
## 11 AFINN (4.5,5.5] 5
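The same distribution can be drawn directly; a minimal sketch, using the `sentiments` tibble loaded above:
# A sketch: plot the AFINN score distribution rather than tabulating it.
sentiments %>%
  filter(lexicon == "AFINN") %>%
  ggplot(aes(factor(score))) +
  geom_bar() +
  xlab("AFINN score") +
  ylab("Number of words")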
afinn_lexicon <- get_sentiments("afinn")
head(afinn_lexicon)
## # A tibble: 6 x 2
## word score
## <chr> <int>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
nrc_lexicon <- get_sentiments("nrc")
head(nrc_lexicon)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
bing_lexicon <- get_sentiments("bing")
head(bing_lexicon)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 2-faces negative
## 3 a+ positive
## 4 abnormal negative
## 5 abolish negative
## 6 abominable negative
bing_positive <- get_sentiments("bing") %>%
filter(sentiment == "positive")
positive<-data %>%
semi_join(bing_positive) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_positive<-sum(positive$n)
positive_graph<-positive %>%
filter(n > 20)
ggplot(positive_graph, aes(word, n)) + geom_bar(stat = "identity", fill = "lightblue") + theme_bw() + coord_flip()
bing_negative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
negative<-data %>%
semi_join(bing_negative) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_negative<-sum(negative$n)
negative_graph<-negative %>%
filter(n > 20)
ggplot(negative_graph, aes(word, n)) + geom_bar(stat = "identity", fill = "lightblue") + theme_bw() + coord_flip()
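The two plots above can also be combined into one faceted comparison; a sketch, assuming a recent ggplot2 (>= 3.3, which maps a discrete y aesthetic directly instead of needing coord_flip):
# A sketch: positive and negative word counts side by side in one plot.
data %>%
  mutate(word = as.character(word)) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  filter(n > 20) %>%
  ggplot(aes(n, reorder(word, n), fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  theme_bw() +
  xlab("Count") +
  ylab(NULL)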
#Using the nrc lexicon, keep only the words associated with the sentiment `joy`
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
#Summarize the usage of `joy` words
joy<-data %>%
semi_join(nrc_joy) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_joy<-sum(joy$n)
nrc_anger <- get_sentiments("nrc") %>%
filter(sentiment == "anger")
anger<-data %>%
semi_join(nrc_anger) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_anger<-sum(anger$n)
nrc_anticipation <- get_sentiments("nrc") %>%
filter(sentiment == "anticipation")
anticipation<-data %>%
semi_join(nrc_anticipation) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_anticipation<-sum(anticipation$n)
nrc_disgust <- get_sentiments("nrc") %>%
filter(sentiment == "disgust")
disgust<-data %>%
semi_join(nrc_disgust) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_disgust<-sum(disgust$n)
nrc_fear <- get_sentiments("nrc") %>%
filter(sentiment == "fear")
fear<-data %>%
semi_join(nrc_fear) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_fear<-sum(fear$n)
nrc_sadness <- get_sentiments("nrc") %>%
filter(sentiment == "sadness")
sadness<-data %>%
semi_join(nrc_sadness) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_sadness<-sum(sadness$n)
nrc_surprise <- get_sentiments("nrc") %>%
filter(sentiment == "surprise")
surprise<-data %>%
semi_join(nrc_surprise) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_surprise<-sum(surprise$n)
nrc_trust <- get_sentiments("nrc") %>%
filter(sentiment == "trust")
trust<-data %>%
semi_join(nrc_trust) %>%
count(word, sort = T)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
nb_trust<-sum(trust$n)
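The eight blocks above repeat the same join-and-count pattern once per emotion. A more compact sketch (ours, not part of the original pipeline) computes all the totals in a single pass:
# A sketch collapsing the eight repeated blocks into one join.
# Each word can carry several NRC emotions, so per-emotion totals match the sums above.
sentiment_totals <- data %>%
  mutate(word = as.character(word)) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(sentiment, sort = TRUE)
sentiment_totals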
lol <- data.frame(
  group = c("Joy", "Anger", "Anticipation", "Disgust", "Fear", "Sadness", "Surprise", "Trust"),
  value = c(nb_joy, nb_anger, nb_anticipation, nb_disgust,
            nb_fear, nb_sadness, nb_surprise, nb_trust)
)
head(lol)
## group value
## 1 Joy 1785
## 2 Anger 2246
## 3 Anticipation 2155
## 4 Disgust 1003
## 5 Fear 3778
## 6 Sadness 2389
library(ggplot2)
# Barplot
bp<- ggplot(lol, aes(x="", y=value, fill=group))+
geom_bar(width = 1, stat = "identity")
bp
library(plotrix)
## Warning: package 'plotrix' was built under R version 3.3.2
value <- c(nb_joy, nb_anger, nb_anticipation, nb_disgust,
           nb_fear, nb_sadness, nb_surprise, nb_trust)
group<-c("Joy", "Anger", "Anticipation","Disgust","Fear","Sadness","Surprise","Trust")
pie3D(value, labels = group, explode = 0.1,
      main = "Pie Chart of Main Sentiments")
df_tweets<-as.data.frame(DF[,c(2)])
colnames(df_tweets)<-c("text")
tweets_bigrams <- df_tweets %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
head(tweets_bigrams)
## # A tibble: 6 x 1
## bigram
## <chr>
## 1 time t
## 2 t people
## 3 people who
## 4 who left
## 5 left their
## 6 their dogs
tweets_bigrams2<-tweets_bigrams %>%
count(bigram, sort = TRUE)
head(tweets_bigrams2)
## # A tibble: 6 x 2
## bigram n
## <chr> <int>
## 1 people who 548
## 2 chained during 538
## 3 dogs chained 538
## 4 face felony 538
## 5 felony charges 538
## 6 left their 538
tweets_bigrams2<-subset(tweets_bigrams2, n>250)
ggplot(tweets_bigrams2, aes(bigram, n)) + geom_bar(stat = "identity", fill = "lightblue") + theme_bw() + coord_flip()
bigrams_separated <- tweets_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
head(bigram_counts)
## # A tibble: 6 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 dogs chained 538
## 2 felony charges 538
## 3 mexico city 271
## 4 au sud 262
## 5 du mexique 262
## 6 en jours 262
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
head(bigrams_united)
## # A tibble: 6 x 1
## bigram
## <chr>
## 1 dogs chained
## 2 felony charges
## 3 charges read
## 4 florida residents
## 5 flood insurance
## 6 mounting bills
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 92 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not to 51
## 2 not many 40
## 3 not the 12
## 4 not a 10
## 5 not in 10
## 6 not jack 7
## 7 not inspite 5
## 8 not sw 5
## 9 not be 4
## 10 not being 4
## # ... with 82 more rows
AFINN <- get_sentiments("afinn")
AFINN
## # A tibble: 2,476 x 2
## word score
## <chr> <int>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,466 more rows
library(tidytext)
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE) %>%
ungroup()
head(not_words)
## # A tibble: 6 x 3
## word2 score n
## <chr> <int> <int>
## 1 forget -1 3
## 2 helping 2 2
## 3 allow 1 1
## 4 clear 1 1
## 5 forgotten -1 1
## 6 god 1 1
library(ggplot2)
not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, score, sort = TRUE) %>%
ungroup()
negated_words
## # A tibble: 18 x 4
## word1 word2 score n
## <chr> <chr> <int> <int>
## 1 not forget -1 3
## 2 no matter 1 2
## 3 not helping 2 2
## 4 never good 3 1
## 5 no easy 1 1
## 6 no help 2 1
## 7 no mercy 2 1
## 8 no shortage -2 1
## 9 no significant 1 1
## 10 no thanks 2 1
## 11 no wrong -2 1
## 12 not allow 1 1
## 13 not clear 1 1
## 14 not forgotten -1 1
## 15 not god 1 1
## 16 not good 3 1
## 17 not increase 1 1
## 18 not suffer -2 1
not_words<-subset(negated_words, word1 == "not")
no_words<-subset(negated_words, word1 == "no")
never_words<-subset(negated_words, word1 == "never")
without_words<-subset(negated_words, word1 == "without")
no_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(100) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"no\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(100) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
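Rather than repeating this plot for each negation word, a single faceted sketch covers all four at once (again assuming ggplot2 >= 3.3 for the discrete y mapping):
# A sketch: one faceted plot for all four negation words.
negated_words %>%
  mutate(contribution = n * score) %>%
  ggplot(aes(contribution, reorder(word2, contribution), fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ word1, scales = "free_y") +
  xlab("Sentiment score * number of occurrences") +
  ylab("Words preceded by a negation term")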
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:tidyr':
##
## %>%, crossing
## The following objects are masked from 'package:dplyr':
##
## %>%, as_data_frame, groups, union
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# original counts
bigram_counts
## # A tibble: 15,619 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 dogs chained 538
## 2 felony charges 538
## 3 mexico city 271
## 4 au sud 262
## 5 du mexique 262
## 6 en jours 262
## 7 jours au 262
## 8 mexique mexico 262
## 9 sud du 262
## 10 puerto rico 207
## # ... with 15,609 more rows
bigram_graph <- bigram_counts %>%
filter(n > 25) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH DN-- 264 238 --
## + attr: name (v/c), n (e/n)
## + edges (vertex names):
## [1] dogs ->chained felony ->charges mexico ->city
## [4] au ->sud du ->mexique en ->jours
## [7] jours ->au mexique ->mexico sud ->du
## [10] puerto ->rico karel ->doorman ms ->karel
## [13] zr ->ms climate ->change du ->cyclone
## [16] est ->pas lil ->du cyclone ->stagne
## [19] depuis ->des des ->annes dvast ->paris
## [22] guadeloupe->ni mais ->qui ni ->mais
## + ... omitted several edges
library(ggraph)
## Warning: package 'ggraph' was built under R version 3.3.2
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()