library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(highcharter)
## Warning: package 'highcharter' was built under R version 3.3.2
## Highcharts (www.highcharts.com) is a
## Highsoft software product which is
## not free for commercial and Governmental use
library(tm.plugin.webmining)
## Warning: package 'tm.plugin.webmining' was built under R version 3.3.2
##
## Attaching package: 'tm.plugin.webmining'
## The following object is masked from 'package:base':
##
## parse
library(plotly)
## Warning: package 'plotly' was built under R version 3.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidytext)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:plotly':
##
## %>%, groups
## The following object is masked from 'package:highcharter':
##
## %>%
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:dplyr':
##
## %>%, as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(widyr)
# ---- Load the tweets and derive per-tweet features ----
# quote = "" disables quote handling, so quote characters inside the
# metadata field are read literally.
hols <- read.csv(
  "HolidayTweets.csv",
  stringsAsFactors = FALSE,
  header = TRUE,
  quote = ""
)

# Strip the HTML markup from every metadata entry to get the tweet text.
text <- unlist(lapply(hols$Metadata, extractHTMLStrip))
# Replace line breaks with a space (not ""), otherwise the words on either
# side of a newline are fused into one token.
text <- gsub("\n", " ", text)

# Keep the text as character (older R versions would convert it to a factor).
data_tweets <- data.frame(text = text, stringsAsFactors = FALSE)

# Hashtags: every "#" followed by non-whitespace; stored as a list column.
data_tweets$hashtags <- str_extract_all(data_tweets$text, "#\\S+")
data_tweets$num_hashtags <- lengths(data_tweets$hashtags)

# Text reduced to alphanumeric characters and spaces.
data_tweets$clean_text <- gsub("[^[:alnum:] ]", "", data_tweets$text)

# Count words as runs of non-whitespace. Counting separators and adding one
# reported 2 for one-word and empty tweets (gregexpr() returns a length-one
# "no match" result) and miscounted leading/trailing spaces.
data_tweets$number_of_words <- str_count(data_tweets$clean_text, "\\S+")

# Flag tweets containing a t.co short link. The dot is escaped and the match
# is anchored on word boundaries so ordinary words such as "taco" no longer
# count as links.
data_tweets$picture <- ifelse(
  str_detect(data_tweets$text, "\\bt\\.co\\b"),
  "Picture/link",
  "No picture/link"
)

# Row identifier; seq_len() is also safe when there are zero rows.
data_tweets$X <- seq_len(nrow(data_tweets))
The data set given consists of ID information and the HTML metadata. Here we use the extractHTMLStrip() function provided by the tm.plugin.webmining library. Upon extraction, we use several text processing tools to derive further information, such as the hashtags, the number of words, and whether pictures/links were used.
# Histogram of tweet length in words, one bar per word count.
ggplot(data_tweets, aes(x = number_of_words)) +
  geom_histogram(binwidth = 1) +
  ggtitle("Number of Words Used")

# Median word count for tweets with and without hashtags.
# Note: the column `n` holds the median, not a row count.
data_tweets %>%
  mutate(if_hashtag = ifelse(num_hashtags > 0, "Yes", "No")) %>%
  group_by(if_hashtag) %>%
  summarise(n = median(number_of_words)) %>%
  DT::datatable()
Based on the median word counts, we estimate that tweets with hashtags use at least 5 times more words than tweets without hashtags.
# Number of tweets in each picture/link category, drawn as columns whose
# height is the natural log of the count.
data_tweets %>%
  dplyr::count(picture) %>%
  hchart("column", x = picture, y = log(n))
We see that most of the tweets did not contain pictures/links. The two bars look comparable in height only because the counts are plotted on a log scale.
# Median word count per picture/link category.
# Note: the column `n` holds the median, not a row count.
data_tweets %>%
  group_by(picture) %>%
  summarise(n = median(number_of_words)) %>%
  DT::datatable()
Tweets with pictures/links tend to use more words, based on the median estimate.
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:igraph':
##
## %>%, crossing
## The following object is masked from 'package:tm.plugin.webmining':
##
## extract
library(RSentiment)
# ---- Clean the tweet text and score the sentiment of every tweet ----
# Start from the raw text, not `clean_text`: `clean_text` has already had
# every non-alphanumeric character removed, so the link, retweet, hashtag and
# @mention patterns below (which rely on "://", "#" and "@") could never
# match it and those tokens stayed in the text.
tweet <- as.character(data_tweets$text)
# removing links; stop at the next whitespace so the rest of the tweet is
# kept (a greedy ".*" would delete everything after the first link)
tweet <- gsub("(f|ht)tps?://\\S+", " ", tweet)
# removing retweet / via attributions
tweet <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", tweet)
# removing hashtags
tweet <- gsub("#\\w+", " ", tweet)
# removing @people
tweet <- gsub("@\\w+", " ", tweet)
# removing punctuation
tweet <- gsub("[[:punct:]]", " ", tweet)
# removing numbers
tweet <- gsub("[[:digit:]]", " ", tweet)
# removing emojis / non-printing characters
# NOTE(review): stringr uses ICU regular expressions, where [:graph:] may
# classify emoji as graphic characters - confirm emoji are actually removed.
tweet <- str_replace_all(tweet, "[^[:graph:]]", " ")
# removing leftover "http"/"https"/"amp" tokens; whole words only, so that
# words such as "camp" or "example" are left intact
tweet <- str_replace_all(tweet, "\\b(https?|amp)\\b", " ")

# Optional extra words/patterns to delete; empty entries are ignored.
wordstoremove <- c("")
drop_patterns <- wordstoremove[nzchar(wordstoremove)]
if (length(drop_patterns) > 0) {
  tweet <- gsub(paste(drop_patterns, collapse = "|"), "", tweet)
}
# removing non-english characters
#tweet1 <- grep('tweet',iconv(tweet,'latin1','ASCII',sub='tweet'))
data_tweets$clean_text_2 <- tweet

# Score each tweet separately. vapply() returns an empty vector when there
# are no rows (a `1:n` loop would iterate over c(1, 0)) and fails loudly if a
# tweet does not yield exactly one label.
# NOTE(review): tweets that consisted only of links/hashtags/mentions are now
# blank strings - confirm calculate_sentiment() handles blank input.
data_tweets$sentiment <- vapply(
  data_tweets$clean_text_2,
  function(txt) as.character(calculate_sentiment(txt)$sentiment),
  character(1),
  USE.NAMES = FALSE
)
# Share of tweets per sentiment label, as a percentage of all tweets.
# NOTE(review): the name `pi` shadows base R's constant `pi` from here on.
pi <- data_tweets %>%
  dplyr::count(sentiment) %>%
  mutate(Percentage = n / sum(n) * 100)

# Pie chart of the sentiment shares; grid, zero line and tick labels are
# switched off on both axes.
plot_ly() %>%
  add_pie(
    data = pi,
    labels = pi$sentiment,
    values = pi$Percentage,
    name = ""
  ) %>%
  layout(
    title = 'Percentage Sentiment in Tweets',
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )
We see that most of the tweets were neutral in nature, followed by positive and negative ones.
To answer this question, we look into what words make up negative tweets.
# ---- Which words make up the negative tweets? ----
# Keep only the tweets scored as negative.
words_neg <- subset(data_tweets, sentiment %in% c("Negative", "Very Negative"))
temp <- words_neg[, c("clean_text_2", "sentiment", "X")]

# One row per word; drop stop words and any token that is not made up purely
# of lower-case letters and apostrophes.
words <- temp %>%
  unnest_tokens(word, clean_text_2) %>%
  filter(
    !word %in% stop_words$word,
    str_detect(word, "^[a-z']+$")
  )

# Replace the tweet-level sentiment with a word-level sentiment. Each distinct
# word is scored once and the label is mapped back onto every occurrence,
# instead of re-scoring repeated words. This also works when `words` has zero
# rows, where a `1:n` loop would iterate over c(1, 0).
distinct_words <- unique(words$word)
word_sentiment <- vapply(
  distinct_words,
  function(w) as.character(calculate_sentiment(w)$sentiment),
  character(1)
)
words$sentiment <- unname(word_sentiment[words$word])

# Negative words that occur more than 10 times, most frequent first.
words %>%
  filter(sentiment %in% c("Negative", "Very Negative")) %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  filter(n > 10) %>%
  hchart("column", x = word, y = n) %>%
  hc_xAxis(labels = list(rotation = -90, step = 1))
Words like “emergency”, “shame”, “pig”, “evil”, “bs” and “terrible” come under this category.
# ---- Sentiment split for tweets with and without a picture/link ----
# Axis settings that hide every axis element (the pies need no axes).
ax <- list(
  zeroline = FALSE,
  showline = FALSE,
  showticklabels = FALSE,
  showgrid = FALSE
)

# Tweet counts per (picture, sentiment) pair.
temp <- data_tweets %>%
  group_by(picture, sentiment) %>%
  summarise(n = n())

# Split by picture category and turn the counts into percentages within
# each category.
temp_pic <- subset(temp, picture == "Picture/link") %>%
  mutate(percentage = (n / sum(n)) * 100)
temp_no_pic <- subset(temp, picture == "No picture/link") %>%
  mutate(percentage = (n / sum(n)) * 100)

# Two pies side by side in the top half of the figure: "No picture/link" on
# the left, "Picture/link" on the right. Each trace is named after its
# category and gets a caption underneath, so the two pies can be told apart
# (both used to be named "By Sentiment" with no label).
pie_chart_1 <- plot_ly() %>%
  add_pie(
    data = temp_pic,
    labels = temp_pic$sentiment,
    values = temp_pic$percentage,
    name = "Picture/link",
    domain = list(x = c(0.52, 1), y = c(0.5, 1))
  ) %>%
  add_pie(
    data = temp_no_pic,
    labels = temp_no_pic$sentiment,
    values = temp_no_pic$percentage,
    name = "No picture/link",
    domain = list(x = c(0, 0.48), y = c(0.5, 1))
  ) %>%
  layout(
    title = "Picture and No Picture",
    xaxis = ax,
    yaxis = ax,
    annotations = list(
      list(
        text = "No picture/link",
        x = 0.24, y = 0.45,
        xref = "paper", yref = "paper",
        xanchor = "center", yanchor = "top",
        showarrow = FALSE
      ),
      list(
        text = "Picture/link",
        x = 0.76, y = 0.45,
        xref = "paper", yref = "paper",
        xanchor = "center", yanchor = "top",
        showarrow = FALSE
      )
    )
  )
pie_chart_1
#pie_chart_2
library(DT)
## Warning: package 'DT' was built under R version 3.3.2
##
## Attaching package: 'DT'
## The following object is masked from 'package:igraph':
##
## %>%
# ---- Most frequent words across all tweets ----
# Tokenise the cleaned text of every tweet into one row per word, keeping the
# tweet's sentiment and row id alongside each word.
temp <- data_tweets[c("clean_text_2", "sentiment", "X")]
words <- temp %>%
  unnest_tokens(word, clean_text_2) %>%
  filter(!word %in% stop_words$word) %>%
  filter(str_detect(word, "^[a-z']+$"))

# Words used more than 250 times, most frequent first; column height is the
# natural log of the count.
words %>%
  dplyr::count(word) %>%
  filter(n > 250) %>%
  arrange(desc(n)) %>%
  hchart("column", x = word, y = log(n)) %>%
  hc_xAxis(labels = list(rotation = -90, step = 1))
#DT::datatable(words %>% group_by(word) %>% summarise(n=n()) %>% filter(n>100) %>% arrange(desc(n)))