Introduction

library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(highcharter)
library(tm.plugin.webmining)
library(plotly)
library(tidytext)
library(gridExtra)
library(igraph)
library(widyr)
hols <- read.csv("HolidayTweets.csv", stringsAsFactors = FALSE, header = TRUE, quote = "")

# Strip the HTML out of each tweet's metadata and drop embedded newlines
text <- unlist(lapply(hols$Metadata, extractHTMLStrip))
text <- gsub("\n", "", text)

data_tweets <- data.frame(text = text, stringsAsFactors = FALSE)
data_tweets$hashtags <- str_extract_all(data_tweets$text, "#\\S+")  # list of hashtags per tweet
data_tweets$num_hashtags <- lengths(data_tweets$hashtags)
data_tweets$clean_text <- gsub("[^[:alnum:] ]", "", data_tweets$text)  # keep only letters, digits, spaces
# Rough word count: one more than the number of gaps between words
data_tweets$number_of_words <- sapply(gregexpr("\\W+", data_tweets$clean_text), length) + 1

# Twitter wraps embedded media and links in t.co URLs
data_tweets$picture <- ifelse(str_detect(data_tweets$text, "t\\.co"), "Picture/link", "No picture/link")

data_tweets$X <- seq_len(nrow(data_tweets))  # tweet id, used later for pairwise correlations

The data set consists of ID information and the HTML metadata of each tweet. We strip the HTML with the extractHTMLStrip() function from the tm.plugin.webmining package. From the extracted text we then derive further fields: the hashtags used, the number of words, and whether the tweet carried a picture or link.
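
To make those steps concrete, here is each one applied to a single made-up tweet (illustrative only; the tweet is not from the data set):

sample_tweet <- "Counting down to #christmas with @santa! See t.co/abc123 #blackfriday"

str_extract_all(sample_tweet, "#\\S+")             # hashtags: "#christmas", "#blackfriday"
cleaned <- gsub("[^[:alnum:] ]", "", sample_tweet) # drop everything but letters, digits, spaces
length(gregexpr("\\W+", cleaned)[[1]]) + 1         # rough word count: gaps between words, plus one
str_detect(sample_tweet, "t\\.co")                 # TRUE, so this tweet would be flagged "Picture/link"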

Hashtags

What were the most commonly used Hashtags?

hash <- unlist(data_tweets$hashtags)
hash <- data.frame(i = seq_along(hash), text = hash, stringsAsFactors = FALSE)
hash$text <- tolower(hash$text)

hash %>% group_by(text) %>% summarise(n = n()) %>% arrange(desc(n)) %>% filter(n > 150) %>%
  hchart("column", x = text, y = log(n)) %>%  # log counts, so the #christmas spike doesn't flatten the rest
  hc_xAxis(labels = list(rotation = -90, step = 1))

As expected, the most popular hashtags are Christmas-related. We also see #iphone, #selfie, #giftcard, #blackfriday and #shoes, among others. Christmas shopping hit an all-time high in 2016, with overall spending in the US crossing a trillion dollars. Other festivals in the season, such as #hanukkah, are mentioned as well.

Distribution of Number of Hashtags used

ggplot(data_tweets, aes(x = num_hashtags)) + geom_histogram(binwidth = 1)

Number of Words Used

Distribution of Number of Words Used

ggplot(data_tweets, aes(x = number_of_words)) + geom_histogram(binwidth = 1) + ggtitle("Number of Words Used")

Distribution of Number of Words used with and without hashtags

data_tweets %>% mutate(if_hashtag = ifelse(num_hashtags > 0, "Yes", "No")) %>%
  ggplot(aes(x = number_of_words, fill = if_hashtag)) +
  geom_histogram(position = "identity", alpha = 0.4, binwidth = 0.5)

Both distributions are right-skewed, so we summarise the word counts with the median rather than the mean.
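
A long right tail drags the mean upward while barely moving the median, so comparing the two side by side shows why the median is the safer summary here (a quick sketch reusing the data_tweets frame built above):

# On a right-skewed distribution the mean typically sits above the median
data_tweets %>%
  mutate(if_hashtag = ifelse(num_hashtags > 0, "Yes", "No")) %>%
  group_by(if_hashtag) %>%
  summarise(mean_words = mean(number_of_words),
            median_words = median(number_of_words))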

Medians

DT::datatable(data_tweets %>%
  mutate(if_hashtag = ifelse(num_hashtags > 0, "Yes", "No")) %>%
  group_by(if_hashtag) %>%
  summarise(median_words = median(number_of_words)))

Tweets with hashtags have a median word count at least five times that of tweets without.

Sentiments

library(tidyr)
library(RSentiment)
tweet <- data_tweets$clean_text
# removing URLs
tweet <- gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", " ", tweet)
# removing retweet/via markers
tweet <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)
# removing hashtags
tweet <- gsub("#\\w+", " ", tweet)
# removing @mentions
tweet <- gsub("@\\w+", " ", tweet)
# removing punctuation
tweet <- gsub("[[:punct:]]", " ", tweet)
# removing numbers
tweet <- gsub("[[:digit:]]", " ", tweet)
# removing emojis and other non-printable characters
tweet <- str_replace_all(tweet, "[^[:graph:]]", " ")
# removing leftover "https" fragments and HTML-escaped ampersands ("amp")
tweet <- str_replace_all(tweet, "https", " ")
tweet <- str_replace_all(tweet, "amp", " ")
# hook for any extra words to strip; empty here, so this step is a no-op
wordstoremove <- c("")
tweet <- sapply(tweet, function(x) gsub(paste(wordstoremove, collapse = "|"), "", x))
data_tweets$clean_text_2 <- tweet



# Score each tweet; calculate_sentiment() returns a data frame whose
# sentiment column holds the class label
data_tweets$sentiment <- rep("", nrow(data_tweets))

for (i in seq_len(nrow(data_tweets))) {
  data_tweets$sentiment[i] <- as.character(calculate_sentiment(data_tweets$clean_text_2[i])$sentiment)
}
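
As a quick sanity check on the classifier, we can score a couple of hand-written sentences (illustrative only; the exact labels depend on RSentiment's dictionaries):

# Hand-written examples, not from the data set
as.character(calculate_sentiment("what a wonderful holiday")$sentiment)
as.character(calculate_sentiment("this queue is terrible")$sentiment)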


# Named sent_pct rather than pi, to avoid shadowing the base constant pi
sent_pct <- data_tweets %>% group_by(sentiment) %>% summarise(n = n()) %>%
  mutate(Percentage = (n / sum(n)) * 100)

plot_ly() %>%
  add_pie(data = sent_pct,
          labels = sent_pct$sentiment,
          values = sent_pct$Percentage,
          name = "") %>%
  layout(title = 'Percentage Sentiment in Tweets',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

Most of the tweets are neutral, followed by positive and then negative ones.

Why are there negative tweets?

To answer this, we break the negative tweets into individual words and score each word on its own.
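
The tokenisation step relies on unnest_tokens() from tidytext, which splits each row into one row per word. A toy example on made-up input:

# Made-up input, illustrative only: one tweet becomes four word rows
data.frame(X = 1, clean_text_2 = "such a terrible queue", stringsAsFactors = FALSE) %>%
  unnest_tokens(word, clean_text_2)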

words_neg <- subset(data_tweets, sentiment %in% c("Negative", "Very Negative"))
temp <- words_neg[, c("clean_text_2", "sentiment", "X")]

# Tokenise the negative tweets, dropping stop words and non-alphabetic tokens
words <- temp %>%
  unnest_tokens(word, clean_text_2) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "^[a-z']+$"))

# Re-score each individual word to find the ones driving the negative labels
for (i in seq_len(nrow(words))) {
  words$sentiment[i] <- as.character(calculate_sentiment(words$word[i])$sentiment)
}

words %>% filter(sentiment %in% c("Negative", "Very Negative")) %>%
  group_by(word) %>% summarise(n = n()) %>% arrange(desc(n)) %>% filter(n > 10) %>%
  hchart("column", x = word, y = n) %>%
  hc_xAxis(labels = list(rotation = -90, step = 1))

Words like “emergency”, “shame”, “pig”, “evil”, “bs” and “terrible” fall into this category.

Most Common Words Used

library(DT)
temp <- data_tweets[, c("clean_text_2", "sentiment", "X")]

# This time tokenise all tweets, again dropping stop words
words <- temp %>%
  unnest_tokens(word, clean_text_2) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "^[a-z']+$"))

words %>% group_by(word) %>% summarise(n = n()) %>% filter(n > 250) %>% arrange(desc(n)) %>%
  hchart("column", x = word, y = log(n)) %>%  # log counts again, to keep "christmas" from dwarfing the rest
  hc_xAxis(labels = list(rotation = -90, step = 1))

Correlated Terms

Which words tend to appear in the same tweets? pairwise_cor() from the widyr package correlates the presence of each pair of words across tweets (the phi coefficient), so pairs that almost always co-occur score close to 1.
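
As a toy illustration of what pairwise_cor() computes, consider a made-up three-tweet corpus (X is the tweet id, as in the real data):

# "merry" and "christmas" always co-occur, while "sale" never appears with
# them, so their pairwise correlations sit at the two extremes (+1 and -1)
toy <- data.frame(X = c(1, 1, 2, 2, 3),
                  word = c("merry", "christmas", "merry", "christmas", "sale"),
                  stringsAsFactors = FALSE)
toy %>% pairwise_cor(word, X, sort = TRUE, upper = FALSE)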

words_counts <- words %>%
  count(word) %>%
  filter(n >= 30)

# Keep only reasonably common words, then correlate their appearances across tweets
words_correlations <- words %>%
  semi_join(words_counts, by = "word") %>%
  pairwise_cor(word, X, sort = TRUE, upper = FALSE)

words_correlations <- filter(words_correlations, correlation > 0.7)
words_correlations <- as.data.frame(words_correlations)

# Build an undirected graph whose edges are the strongly correlated word pairs
graph <- make_graph(t(words_correlations[, c(1, 2)]), directed = FALSE)
E(graph)$weight <- words_correlations$correlation
V(graph)$label.cex <- 1.0

plot(graph, edge.width = E(graph)$weight, layout = layout.fruchterman.reingold, vertex.size = 5)

library(networkD3)
# Interactive version of the same correlation network
edges <- as.data.frame(get.edgelist(graph))
simpleNetwork(edges, charge = -200, opacity = 0.6, zoom = TRUE, fontSize = 15)