# Chunk default for the knitted report: echo = TRUE.
knitr::opts_chunk$set(echo = TRUE)

# Attach every dependency with library() so that a missing package stops the
# script right here. require() only warns and returns FALSE, which postpones
# the failure to the first call that needs the package.
library("dplyr")
library("tidytext")
library("textdata")
library("widyr")
library("tibble")
library("ggplot2")
library("ggpubr")
# Work from the project folder so the relative data path below resolves.
setwd("~/Documents/ESCP/Consumer Insight Analytics/Berlin Semester")

# Load the raw tweets and discard the `X` column.
tweets <- read.csv(file = "tweets.csv")
tweets[["X"]] <- NULL

# Keep only the `text`, `screenName` and `created` columns.
tweets_data <- select(tweets, text, screenName, created)

# Same table, with `text` coerced to character for tokenization.
text_df <- tweets_data %>%
  mutate(text = as.character(text))

# Summary of the selected columns.
skimr::skim(tweets_data)
Name | tweets_data |
Number of rows | 2000 |
Number of columns | 3 |
_______________________ | |
Column type frequency: | |
factor | 3 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
text | 0 | 1 | FALSE | 1433 | RT : 85, RT : 66, RT : 42, RT : 31 |
screenName | 0 | 1 | FALSE | 1801 | Hot: 14, RAD: 8, mac: 6, Que: 6 |
created | 0 | 1 | FALSE | 1872 | 201: 4, 201: 4, 201: 3, 201: 3 |
# Tokenize: split the `text` column into a `word` column.
tidy_tweets <- unnest_tokens(text_df, word, text)

# Clean: drop stop words, tokens of one or two characters, URL / platform
# residue, and any token in which a letter is directly followed by a digit.
cleantidytweets <- tidy_tweets %>%
  anti_join(get_stopwords()) %>%
  filter(nchar(word) > 2) %>%
  filter(
    !(word %in% c("http", "https", "facebook", "amp", "t.co", "https_")),
    !grepl("[[:alpha:]][[:digit:]]", word)
  )

# Word frequencies, most frequent first; `word` becomes a factor ordered by
# its count so the bar chart is drawn in frequency order.
tweets.count <- cleantidytweets %>%
  count(word, sort = TRUE)
tweets.count$word <- with(tweets.count, reorder(word, n))

# Bar chart of the 20 most frequent words.
top20 <- tweets.count %>%
  head(20)
ggplot(data = top20, mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  theme_pubclean()

# Word cloud of up to 50 words, largest first.
wordcloud::wordcloud(
  words = tweets.count$word,
  freq = tweets.count$n,
  min.freq = 1,
  max.words = 50,
  random.order = FALSE
)
We can see from the above that the most frequent word is socialmedia. The rest of the top five are data, new, like and page, in that order. The last three of these carry little semantic meaning. In the word cloud we also notice many terrorism-related words, such as terrorist, hamas and attacks.
# Word counts per user (each screenName is treated as one document), most
# frequent first.
tweets.tf_idf <- count(cleantidytweets, screenName, word, sort = TRUE)

# Keep only (user, word) pairs that occur more than four times.
# NOTE(review): this filter runs before bind_tf_idf(), so tf-idf is computed on
# the reduced table rather than on the full counts -- confirm this is intended.
tweets.tf_idf <- tweets.tf_idf[tweets.tf_idf$n > 4, ]
tweets.tf_idf <- bind_tf_idf(tweets.tf_idf, word, screenName, n)

# A word may appear under several users. Sort by tf-idf first so that the
# de-duplication keeps each word's highest score; previously the surviving row
# was whichever had the highest raw count.
tweets.tf_idf <- tweets.tf_idf[order(tweets.tf_idf$tf_idf, decreasing = TRUE), ]
tweets.tf_idf <- tweets.tf_idf[!duplicated(tweets.tf_idf$word), ]

# `min.freq` is compared against the values passed as `freq`, which here are
# tf-idf scores rather than counts. A threshold of 1 silently discarded every
# word scoring below 1, so use 0 and let `max.words` do the trimming.
wordcloud::wordcloud(
  tweets.tf_idf$word,
  tweets.tf_idf$tf_idf,
  min.freq = 0,
  max.words = 30,
  random.order = FALSE
)
With Tf-Idf, the top 5 most important words have changed completely.
# Tokens that carry an AFINN score.
affin <- cleantidytweets %>%
  inner_join(get_sentiments("afinn"), by = "word")

# Tokens found in both lexicons: NRC supplies `sentiment`, AFINN supplies
# `value`.
tweets.sen <- cleantidytweets %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  inner_join(get_sentiments("afinn"), by = "word")

# Preview the first ten rows.
tweets.sen %>%
  head(10)
## screenName created word sentiment value
## 1 Gregory52594205 2018-05-16 14:02:40 peaceful anticipation 2
## 2 Gregory52594205 2018-05-16 14:02:40 peaceful joy 2
## 3 Gregory52594205 2018-05-16 14:02:40 peaceful positive 2
## 4 Gregory52594205 2018-05-16 14:02:40 peaceful surprise 2
## 5 Gregory52594205 2018-05-16 14:02:40 peaceful trust 2
## 6 Joy105com 2018-05-16 14:02:36 fake negative -3
## 7 achi_narahashi 2018-05-16 14:02:20 fake negative -3
## 8 elefandoms 2018-05-16 14:02:12 peace anticipation 2
## 9 elefandoms 2018-05-16 14:02:12 peace joy 2
## 10 elefandoms 2018-05-16 14:02:12 peace positive 2
# Count how often each word occurs under each sentiment, most frequent first,
# and turn `word` into a factor ordered by its count.
tweets.sen_count <- count(tweets.sen, sentiment, word, sort = TRUE)
tweets.sen_count$word <- reorder(tweets.sen_count$word, tweets.sen_count$n)

# Keep the first five rows (the most frequent words) of every sentiment and
# bind the per-sentiment pieces back into one table.
tweets.sen_count <- by(tweets.sen_count, tweets.sen_count["sentiment"], head, n = 5)
tweets.sen_count <- Reduce(rbind, tweets.sen_count)

# One stacked bar per sentiment. geom_col() stacks the rows, so each bar's
# length is the SUM of `n`; order the bars by that same sum. reorder() defaults
# to the mean, which disagrees with bar length whenever a sentiment has fewer
# than five words.
ggplot(
  tweets.sen_count,
  aes(x = reorder(sentiment, n, FUN = sum), y = n, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  labs(y = "Sentiment Importance", x = NULL) +
  coord_flip() +
  theme_pubclean()
The three most important sentiments are positive, trust and anticipation, all of which are positive in tone.
# Top words per sentiment, one facet per sentiment.
# `word` is ordered once for the whole table, so bars inside a facet were not
# necessarily sorted by that facet's own counts. reorder_within() orders the
# words separately within each sentiment, and scale_x_reordered() strips the
# suffix it appends from the axis labels.
ggplot(
  tweets.sen_count,
  aes(x = reorder_within(word, n, sentiment), y = n, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  scale_x_reordered() +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip() +
  theme_pubclean()
Here we see the five words that contribute most to each sentiment. For example, the top five words for the sentiment anger are illegal, hate, abuse, violence and fight, in that order.
# Mean AFINN score over the NRC-joined table. That table holds one row per
# (token, NRC sentiment) pair, so a token tagged with several NRC categories
# is counted once per category.
with(tweets.sen, mean(value))
## [1] 0.3604888
# Mean AFINN score over the AFINN-only join.
with(affin, mean(value))
## [1] 0.2879153
Looks like the posts about Facebook are more positive than negative.