We will analyze over 15k tweets sent by ISIS supporters since the November 2015 Paris Attacks. You can download the .csv data (isis_tweets.csv) from the class folder. The data are also available on kaggle.com.
Compiled based on http://juliasilge.com/blog/Ten-Thousand-Tweets/ and http://juliasilge.com/blog/Joy-to-the-World/
You will likely need to install these libraries:
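#install.packages("ggplot2") # an R library for data visualization
#install.packages("lubridate") # an R library to clean up timestamps
#install.packages("tm") # an R library for text analytics
#install.packages("wordcloud") # an R library for generating word clouds
#install.packages("syuzhet") # an R library for sentiment analysis
#install.packages("dplyr") # an R library for data manipulation
#install.packages("scales") # an R library for axis breaks and labels
#install.packages("stringr") # an R library for string manipulation
#install.packages("reshape2") # an R library for reshaping data (melt)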
#require(devtools)
#install_github('rCharts', 'ramnathv')
#install.packages("leaflet")
Let’s fire up the libraries.
library(ggplot2)
library(lubridate)
library(scales)
library(tm)
library(stringr)
library(wordcloud)
library(syuzhet)
library(reshape2)
library(dplyr)
Let’s load the .csv data.
# Load the tweets. The file must be in the current working directory
# (check it with getwd()).
# stringsAsFactors = FALSE keeps the text and timestamp columns as plain
# character vectors on every R version; before R 4.0 read.csv() would
# otherwise convert them to factors.
alltweets <- read.csv("isis_tweets.csv", header = TRUE,
                      stringsAsFactors = FALSE)
Convert the timestamps of the tweets to the same time zone
# Parse the month-day-year hour:minute strings into date-times, then express
# those same instants in US Eastern time.
alltweets$created <- with_tz(
  mdy_hm(alltweets$created),
  tzone = "America/New_York"
)
Show twitter volume by month
# Bar chart of the number of tweets per calendar month.
# The x aesthetic must be month(), not wday(): with wday() this chart was a
# copy of the day-of-week chart under a "Month" axis label.
# NOTE(review): tweets from the same month of different years are pooled
# into one bar -- confirm that is intended if the data span several years.
ggplot(data = alltweets, aes(x = month(created, label = TRUE))) +
  geom_bar(aes(fill = ..count..)) +  # shade each bar by its own count
  theme(legend.position = "none") +
  xlab("Month") + ylab("Number of tweets") +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")
Show twitter volume by weekdays
# Bar chart of the number of tweets per day of the week; each bar is shaded
# by its own count, so the colour legend is redundant and hidden.
ggplot(alltweets, aes(x = wday(created, label = TRUE))) +
  geom_bar(aes(fill = ..count..)) +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4") +
  labs(x = "Day of the Week", y = "Number of tweets") +
  theme(legend.position = "none")
Show twitter volume by hour
# Time of day of each tweet, stored as seconds elapsed since midnight of the
# day it was sent.
# units = "secs" is forced: a bare subtraction of date-times returns a
# difftime whose units (secs, mins, hours, ...) are chosen automatically from
# the data, so as.numeric() on it is not guaranteed to be in seconds.
alltweets$timeonly <- as.numeric(
  difftime(alltweets$created, trunc(alltweets$created, "days"), units = "secs")
)
# Re-interpret the seconds as a date-time on 1970-01-01 UTC so that the axis
# labels below (formatted as hours) read as the time of day.
alltweets$timeonly <- as.POSIXct(alltweets$timeonly,
                                 origin = "1970-01-01", tz = "UTC")
# Histogram of tweets over the 24-hour day, with an axis tick every 2 hours.
ggplot(data = alltweets, aes(x = timeonly)) +
  geom_histogram(aes(fill = ..count..)) +
  theme(legend.position = "none") +
  xlab("Time") + ylab("Number of tweets") +
  scale_x_datetime(breaks = date_breaks("2 hours"),
                   labels = date_format("%H:00")) +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Produce a word cloud of frequent terms
# Drop @mentions entirely (the whole handle, not only the "@" symbol) so that
# account names do not show up among the frequent terms.
nohandles <- str_replace_all(alltweets$text, "@\\w+", "")
wordCorpus <- Corpus(VectorSource(nohandles))
wordCorpus <- tm_map(wordCorpus, content_transformer(tolower))
# Stopwords are removed BEFORE punctuation: list entries such as "don't" or
# "i'm" can only match while the apostrophe is still in the text. Stripping
# punctuation first leaves "dont"/"im" behind as ordinary words.
wordCorpus <- tm_map(wordCorpus, removeWords, stopwords("english"))
wordCorpus <- tm_map(wordCorpus, removePunctuation)
# These tokens are removed after punctuation so that "&amp;" has already
# been reduced to "amp".
wordCorpus <- tm_map(wordCorpus, removeWords, c("amp", "2yo", "3yo", "4yo"))
wordCorpus <- tm_map(wordCorpus, stripWhitespace)
# Keep only the five darkest shades of the 9-colour palette.
pal <- brewer.pal(9, "YlGnBu")
pal <- pal[-(1:4)]
# Fixed seed so the word layout is reproducible.
set.seed(123)
wordcloud(words = wordCorpus, scale = c(10, 0.3), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = pal)
Produce a word cloud of frequent twitter mentions
# Collect every @handle mentioned in each tweet (a list, one element per tweet).
friends <- str_extract_all(alltweets$text, "@\\w+")
# Count the handles directly rather than passing the list through a text
# corpus: corpus tokenisation is designed for prose and can alter the
# handles, whereas a plain frequency table keeps each one intact.
# Handles are lower-cased so that differently-cased spellings of the same
# account are counted together.
mentioned <- tolower(unlist(friends))
mentioned <- mentioned[!is.na(mentioned)]
mention_counts <- table(mentioned)
# Fixed seed so the word layout is reproducible.
set.seed(146)
# Skip the plot when no tweet mentions anyone.
if (length(mention_counts) > 0) {
  wordcloud(words = names(mention_counts), freq = as.vector(mention_counts),
            scale = c(7, 0.5), max.words = 40, random.order = FALSE,
            rot.per = 0.10, use.r.layout = FALSE, colors = pal)
}
Generate sentiment score for each tweet
# Score every tweet (with @mentions stripped) and append the score columns
# to the tweet data.
alltweets$clean_text <- str_replace_all(alltweets$text, "@\\w+", "")
Sentiment <- get_nrc_sentiment(alltweets$clean_text)
alltweets_senti <- cbind(alltweets, Sentiment)
# Total each of the eight emotion categories over all tweets. The columns
# are selected by name rather than by position (formerly 11:18), so the
# result no longer depends on how many columns the input CSV has.
emotion_cols <- c("anger", "anticipation", "disgust", "fear",
                  "joy", "sadness", "surprise", "trust")
sentimentTotals <- data.frame(count = colSums(Sentiment[, emotion_cols]))
# Move the emotion names from the row names into a regular column.
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals)
rownames(sentimentTotals) <- NULL
Twitter volume in each sentiment category
# One bar per sentiment category, with bar height equal to the summed score.
# The fill colour only repeats the x axis, so the legend is hidden.
ggplot(sentimentTotals, aes(x = sentiment, y = count, fill = sentiment)) +
  geom_col() +
  labs(x = "Sentiment", y = "Total Count",
       title = "Total Sentiment Score for All Tweets") +
  theme(legend.position = "none")
Positive and negative sentiment over time
# Bin the tweets into consecutive 2-day windows, average the negative and
# positive scores inside each window, then reshape to long format
# (one row per window and polarity).
posnegtime <- alltweets_senti %>%
  mutate(created = cut(created, breaks = "2 days")) %>%
  group_by(created) %>%
  summarise(
    negative = mean(negative),
    positive = mean(positive)
  ) %>%
  melt()
## Using created as id variables
names(posnegtime) <- c("timestamp", "sentiment", "meanvalue")
# Flip the two factor levels so "positive" comes first; the manual colours
# below are assigned in level order (green, then red).
posnegtime$sentiment <- factor(posnegtime$sentiment,
                               levels = rev(levels(posnegtime$sentiment)))
# One line per polarity showing the mean score in each 2-day window, with
# small uncoloured points marking the individual windows.
ggplot(posnegtime, aes(x = as.Date(timestamp), y = meanvalue, group = sentiment)) +
  geom_line(aes(color = sentiment), size = 1.5, alpha = 0.7) +
  geom_point(size = 0.3) +
  scale_colour_manual(values = c("springgreen4", "firebrick3")) +
  scale_x_date(breaks = date_breaks("3 months"),
               labels = date_format("%Y-%b")) +
  ylim(0, NA) +
  labs(y = "Average sentiment score", title = "Sentiment Over Time") +
  theme(legend.title = element_blank(), axis.title.x = element_blank())
Various sentiment types over time
# Tag each tweet with its calendar month (as a labelled factor).
alltweets_senti[["month"]] <- month(alltweets_senti[["created"]], label = TRUE)
# Average each of the eight emotion scores per month, then reshape to long
# format (one row per month and emotion).
monthly_means <- summarise(
  group_by(alltweets_senti, month),
  anger = mean(anger), anticipation = mean(anticipation),
  disgust = mean(disgust), fear = mean(fear),
  joy = mean(joy), sadness = mean(sadness),
  surprise = mean(surprise), trust = mean(trust)
)
monthlysentiment <- melt(monthly_means)
## Using month as id variables
monthlysentiment <- setNames(monthlysentiment,
                             c("month", "sentiment", "meanvalue"))
# One coloured line per emotion showing its mean score in each calendar
# month, with small uncoloured points marking the individual months.
ggplot(monthlysentiment, aes(x = month, y = meanvalue, group = sentiment)) +
  geom_line(aes(color = sentiment), size = 2.5, alpha = 0.7) +
  geom_point(size = 0.5) +
  ylim(0, NA) +
  labs(y = "Average sentiment score", title = "Sentiment During the Year") +
  theme(legend.title = element_blank(), axis.title.x = element_blank())