We will analyze over 15k tweets sent by ISIS supporters since the November 2015 Paris Attacks. You can download the .csv data (isis_tweets.csv) from the class folder. The data are also available on kaggle.com.

Compiled based on http://juliasilge.com/blog/Ten-Thousand-Tweets/ and http://juliasilge.com/blog/Joy-to-the-World/

You will likely need to install these libraries:

#install.packages("ggplot2") # an R library for data visualization
#install.packages("lubridate") # an R library for cleaning up timestamps
#install.packages("tm") # an R library for text analytics
#install.packages("wordcloud") # an R library for generating word clouds
#install.packages("syuzhet") # an R library for sentiment analysis
#install.packages("dplyr")

#require(devtools)
#install_github('rCharts', 'ramnathv')
#install.packages("leaflet")

Let’s fire up the libraries.

library(ggplot2)
library(lubridate)
library(scales)
library(tm)
library(stringr)
library(wordcloud)
library(syuzhet)
library(reshape2)
library(dplyr)

Let’s load the .csv data.

# Read the tweet export into a data frame. The file is looked up relative to
# the current working directory; run getwd() to see where that is.
alltweets <- read.csv(file = "isis_tweets.csv", header = TRUE)

Convert the timestamps of the tweets to the same time zone

# Parse the month/day/year hour:minute strings into date-times, then express
# every timestamp in the America/New_York time zone.
alltweets$created <- with_tz(mdy_hm(alltweets$created), "America/New_York")

Show twitter volume by month

# Bar chart of the number of tweets per calendar month.
# The x aesthetic groups on month(); grouping on wday() here would draw
# day-of-week counts under an axis that is labelled "Month".
ggplot(data = alltweets, aes(x = month(created, label = TRUE))) +
  geom_bar(aes(fill = ..count..)) +  # shade each bar by its own count
  theme(legend.position = "none") +  # the fill legend repeats the y axis
  xlab("Month") + ylab("Number of tweets") +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")

Show Twitter volume by day of the week

# Tweet counts for each day of the week. Bars are coloured by their count
# along a dark-blue to green gradient, and the fill legend is hidden.
ggplot(alltweets, aes(x = wday(created, label = TRUE))) +
  geom_bar(aes(fill = ..count..)) +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4") +
  labs(x = "Day of the Week", y = "Number of tweets") +
  theme(legend.position = "none")

Show twitter volume by hour

# Extract the time of day: seconds elapsed since midnight of each tweet's own
# day. The units are pinned to seconds because plain date-time subtraction
# lets difftime() pick secs/mins/hours from the data, which would silently
# change what the numbers below mean.
secs_since_midnight <- as.numeric(
  difftime(alltweets$created, trunc(alltweets$created, "days"), units = "secs")
)

# Re-anchor the offsets on 1970-01-01 UTC so that all tweets share one dummy
# date and only the clock time varies. Fixing the zone to UTC keeps the axis
# breaks aligned with the "%H:00" labels regardless of the machine's locale.
alltweets$timeonly <- as.POSIXct(secs_since_midnight,
                                 origin = "1970-01-01", tz = "UTC")

# Histogram of tweets by time of day, with an axis tick every two hours.
ggplot(data = alltweets, aes(x = timeonly)) +
  geom_histogram(aes(fill = ..count..)) +
  theme(legend.position = "none") +
  xlab("Time") + ylab("Number of tweets") + 
  scale_x_datetime(breaks = date_breaks("2 hours"), 
                   labels = date_format("%H:00")) +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Produce a word cloud of frequent terms

# Drop every @mention (the "@" together with the handle that follows it) so
# that user names do not end up in the term cloud.
nohandles <- str_replace_all(alltweets$text, "@\\w+", "")

# Build the corpus and clean it in sequence: strip punctuation, lower-case,
# remove English stopwords, remove a few extra tokens, collapse whitespace.
wordCorpus <- Corpus(VectorSource(nohandles)) %>%
  tm_map(removePunctuation) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeWords, c("amp", "2yo", "3yo", "4yo")) %>%
  tm_map(stripWhitespace)

# Keep only the five darkest shades of the nine-colour YlGnBu palette.
pal <- brewer.pal(9, "YlGnBu")[5:9]

# Seed the RNG so the cloud layout is reproducible.
set.seed(123)
wordcloud(
  words = wordCorpus,
  scale = c(10, 0.3),
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = pal
)

Produce a word cloud of frequent twitter mentions

# Collect every @mention found in each tweet (one list element per tweet) and
# turn the result into a corpus.
friends <- str_extract_all(alltweets$text, "@\\w+")
namesCorpus <- Corpus(VectorSource(friends))

# Seed the RNG so the layout is reproducible, then draw the 40 most frequent
# handles. `pal` is the colour vector created for the term cloud earlier in
# this script.
set.seed(146)
wordcloud(
  words = namesCorpus,
  scale = c(7, 0.5),
  max.words = 40,
  random.order = FALSE,
  rot.per = 0.10,
  use.r.layout = FALSE,
  colors = pal
)

Generate sentiment score for each tweet

# Score each tweet with the NRC lexicon after removing @mentions, and attach
# the per-tweet scores to the tweet data.
alltweets$clean_text <- str_replace_all(alltweets$text, "@\\w+", "")
Sentiment <- get_nrc_sentiment(alltweets$clean_text)
alltweets_senti <- cbind(alltweets, Sentiment)

# Total each of the eight emotion scores over all tweets. The columns are
# picked by name rather than by position: numeric indices only land on the
# emotion columns when the CSV has exactly the expected number of columns,
# and would otherwise sum the wrong data without any error.
emotion_cols <- c("anger", "anticipation", "disgust", "fear",
                  "joy", "sadness", "surprise", "trust")
sentimentTotals <- data.frame(count = colSums(Sentiment[, emotion_cols]))

# Move the emotion names out of the row names into a regular column.
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals)
rownames(sentimentTotals) <- NULL

Twitter volume in each sentiment category

# One bar per emotion; bar height is the summed score across all tweets.
# Bars are coloured by emotion, so the legend would only repeat the x axis.
ggplot(sentimentTotals, aes(x = sentiment, y = count)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  labs(
    x = "Sentiment",
    y = "Total Count",
    title = "Total Sentiment Score for All Tweets"
  ) +
  theme(legend.position = "none")

Positive and negative sentiment over time

# Average the negative and positive scores within consecutive two-day
# windows, then reshape to long form: one row per window and polarity.
posnegtime <- alltweets_senti %>%
  group_by(created = cut(created, breaks = "2 days")) %>%
  summarise(
    negative = mean(negative),
    positive = mean(positive)
  ) %>%
  melt()
## Using created as id variables
names(posnegtime) <- c("timestamp", "sentiment", "meanvalue")

# Reverse the level order so "positive" comes first; the manual colour scale
# below assigns its colours in level order.
posnegtime$sentiment <- factor(
  posnegtime$sentiment,
  levels = rev(levels(posnegtime$sentiment))
)

# One line per polarity, with the two-day window start on the x axis.
ggplot(posnegtime, aes(x = as.Date(timestamp), y = meanvalue, group = sentiment)) +
  geom_line(aes(color = sentiment), size = 1.5, alpha = 0.7) +
  geom_point(size = 0.3) +
  scale_colour_manual(values = c("springgreen4", "firebrick3")) +
  scale_x_date(
    breaks = date_breaks("3 months"),
    labels = date_format("%Y-%b")
  ) +
  ylim(0, NA) +
  labs(y = "Average sentiment score", title = "Sentiment Over Time") +
  theme(legend.title = element_blank(), axis.title.x = element_blank())

Various sentiment types over time

# Label every tweet with its calendar month, average each of the eight
# emotion scores per month, and reshape to long form for plotting.
alltweets_senti$month <- month(alltweets_senti$created, label = TRUE)
monthlysentiment <- alltweets_senti %>%
  group_by(month) %>%
  summarise(
    anger = mean(anger),
    anticipation = mean(anticipation),
    disgust = mean(disgust),
    fear = mean(fear),
    joy = mean(joy),
    sadness = mean(sadness),
    surprise = mean(surprise),
    trust = mean(trust)
  ) %>%
  melt()
## Using month as id variables
names(monthlysentiment) <- c("month", "sentiment", "meanvalue")

# One line per emotion across the months.
ggplot(monthlysentiment, aes(x = month, y = meanvalue, group = sentiment)) +
  geom_line(aes(color = sentiment), size = 2.5, alpha = 0.7) +
  geom_point(size = 0.5) +
  ylim(0, NA) +
  labs(y = "Average sentiment score", title = "Sentiment During the Year") +
  theme(legend.title = element_blank(), axis.title.x = element_blank())