knitr::opts_chunk$set(echo = TRUE)
# Attach every package the analysis depends on. library() stops with an error
# as soon as a package is missing, whereas require() only warns and returns
# FALSE, which lets the script continue and fail later with a less obvious
# message.
library("dplyr")
library("tidytext")
library("textdata")
library("widyr")
library("tibble")
library("ggplot2")
library("ggpubr")

# Switch to the project folder so the relative CSV path below resolves, then
# read the raw tweet export into a data frame.
setwd("~/Documents/ESCP/Consumer Insight Analytics/Berlin Semester")
tweets <- read.csv(file = "tweets.csv")
# Drop the column named X if present.
# NOTE(review): presumably the row-index column written when the CSV was
# exported - confirm against the file.
tweets[["X"]] <- NULL

Summary of variables

# Keep only the three columns used in the analysis: tweet text, author handle
# and creation timestamp.
tweets_data <- select(tweets, text, screenName, created)
# Same table with the text column coerced to character for tokenization.
text_df <- tweets_data %>% mutate(text = as.character(text))
# Per-column overview of the selected (uncoerced) variables.
skimr::skim(tweets_data)
Data summary
Name tweets_data
Number of rows 2000
Number of columns 3
_______________________
Column type frequency:
factor 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
text 0 1 FALSE 1433 RT : 85, RT : 66, RT : 42, RT : 31
screenName 0 1 FALSE 1801 Hot: 14, RAD: 8, mac: 6, Que: 6
created 0 1 FALSE 1872 201: 4, 201: 4, 201: 3, 201: 3

Tokenize and Clean

# Tokenize: one row per token, stored in the `word` column.
tidy_tweets <- text_df %>% unnest_tokens(word, text)

# Clean: remove stop words, tokens of two characters or fewer, a fixed list of
# noise tokens, and any token containing a letter immediately followed by a
# digit.
# The join key is spelled out so the stop-word filter always matches on `word`
# only and does not print an implicit "Joining" message.
cleantidytweets <- tidy_tweets %>%
  anti_join(get_stopwords(), by = "word") %>%
  filter(
    nchar(word) > 2,
    !word %in% c("http", "https", "facebook", "amp", "t.co", "https_"),
    !grepl("[[:alpha:]][[:digit:]]", word)
  )

Most Frequent Words Appearing in the Corpus

# Frequency of each cleaned token, most common first. `word` is turned into a
# factor ordered by its count so the bars below follow frequency order.
tweets.count <- cleantidytweets %>%
  count(word, sort = TRUE) %>%
  mutate(word = reorder(word, n))

# Horizontal bar chart of the 20 most frequent tokens.
top20 <- head(tweets.count, n = 20)
ggplot(data = top20, mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  theme_pubclean()

Word Cloud

# Word cloud of the token counts, limited to at most 50 words.
wordcloud::wordcloud(
  words = tweets.count$word,
  freq = tweets.count$n,
  min.freq = 1,
  max.words = 50,
  random.order = FALSE
)

We can see from the above that the most frequent word is socialmedia. The rest of the top five are data, new, like and page, in that order; the last three of these carry little semantic meaning. In the word cloud we notice that many words are related to terrorism, such as terrorist, hamas and attacks.

Tf-Idf

# Count tokens per author (screenName acts as the document), keep only
# author/word pairs that occur more than four times, then append the tf, idf
# and tf-idf columns.
tweets.tf_idf <- cleantidytweets %>%
  count(screenName, word, sort = TRUE) %>%
  subset(n > 4) %>%
  bind_tf_idf(term = word, document = screenName, n = n)

Word Cloud for Tf-Idf

# Keep only the first row for each word so every term is drawn once.
tweets.tf_idf <- tweets.tf_idf[!duplicated(tweets.tf_idf$word), ]
# Word cloud weighted by tf-idf, limited to at most 30 words.
# NOTE(review): the weights passed as `freq` are tf-idf values rather than
# integer counts while `min.freq` is 1 - confirm that this cutoff is not
# discarding terms unintentionally.
wordcloud::wordcloud(
  words = tweets.tf_idf$word,
  freq = tweets.tf_idf$tf_idf,
  min.freq = 1,
  max.words = 30,
  random.order = FALSE
)

With Tf-Idf, the top 5 most important words have changed completely.

Sentiment Analysis

# `tweets.sen`: tokens matched against the NRC lexicon and then the AFINN
# lexicon, so each row carries both a `sentiment` label and a numeric `value`.
tweets.sen <- cleantidytweets %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  inner_join(get_sentiments("afinn"), by = "word")
# `affin`: tokens matched against the AFINN lexicon only.
affin <- cleantidytweets %>%
  inner_join(get_sentiments("afinn"), by = "word")
# Preview the first ten rows of the combined table.
head(tweets.sen, n = 10)
##         screenName             created     word    sentiment value
## 1  Gregory52594205 2018-05-16 14:02:40 peaceful anticipation     2
## 2  Gregory52594205 2018-05-16 14:02:40 peaceful          joy     2
## 3  Gregory52594205 2018-05-16 14:02:40 peaceful     positive     2
## 4  Gregory52594205 2018-05-16 14:02:40 peaceful     surprise     2
## 5  Gregory52594205 2018-05-16 14:02:40 peaceful        trust     2
## 6        Joy105com 2018-05-16 14:02:36     fake     negative    -3
## 7   achi_narahashi 2018-05-16 14:02:20     fake     negative    -3
## 8       elefandoms 2018-05-16 14:02:12    peace anticipation     2
## 9       elefandoms 2018-05-16 14:02:12    peace          joy     2
## 10      elefandoms 2018-05-16 14:02:12    peace     positive     2
# Count how often each word occurs under each sentiment (highest counts first)
# and turn `word` into a factor ordered by count for plotting.
tweets.sen_count <- count(tweets.sen, sentiment, word, sort = TRUE)
tweets.sen_count$word <- with(tweets.sen_count, reorder(word, n))
# Split the table by sentiment, keep the first five rows of each piece (the
# five highest counts, because the table is sorted), and stack the pieces back
# into one data frame.
tweets.sen_count <- Reduce(
  rbind,
  lapply(split(tweets.sen_count, tweets.sen_count$sentiment), head, n = 5)
)

# Horizontal bars of the retained word counts per sentiment, with the
# sentiments ordered by n.
ggplot(
  data = tweets.sen_count,
  mapping = aes(x = reorder(sentiment, n), y = n, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  labs(y = "Sentiment Importance", x = NULL) +
  coord_flip() +
  theme_pubclean()

The three most important sentiments are positive, trust and anticipation, all of which are associated with positive sentiment.

# One panel per sentiment, each with its own axis scales, showing the count of
# every retained word as a horizontal bar.
ggplot(
  data = tweets.sen_count,
  mapping = aes(x = word, y = n, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip() +
  theme_pubclean()

Here we see the five words that contribute the most to each sentiment. For example, the five most contributive words for the sentiment anger are illegal, hate, abuse, violence and fight, in that order.

Mean Sentiment Score

Sentiment score for joined dictionaries:

# Average AFINN value over every row of the NRC + AFINN join.
# NOTE(review): a word tagged with several NRC sentiments appears once per tag
# in this table, so it is weighted accordingly - confirm this is intended.
mean(tweets.sen[["value"]])
## [1] 0.3604888

Sentiment score for AFINN alone:

# Average AFINN value over the tokens matched against AFINN only.
mean(affin[["value"]])
## [1] 0.2879153

Both mean scores are above zero, so it looks like the posts about Facebook are, on average, more positive than negative.