Demo of using twitter-hashtag-analytics package to analyze tweets

1. Data preparation

Set the working directory and load the source files.

# check working directory
getwd()
## [1] "/home/bodong/src/r/twitter-analytics/twitter-hashtag-analytics"

# note that knitr automatically sets the working directory to where the
# Rmd file is; if you wish to run code line by line, set it manually:
# setwd('/home/bodong/src/r/twitter-analytics/twitter-hashtag-analytics')

# load source files
source("get_tweets.R")
source("munge_tweets.R")
source("utilities.R")

Retrieve a Twitter hashtag dataset by search:

# get tweets by search; this function is defined in get_tweets.R
# df <- GetTweetsBySearch('#LAK13')

# save or load data (for debugging)
# save(df, file='./data/df.Rda')
load("./data/df.Rda")

# check names of columns
names(df)
##  [1] "text"         "favorited"    "replyToSN"    "created"     
##  [5] "truncated"    "replyToSID"   "id"           "replyToUID"  
##  [9] "statusSource" "screenName"

This dataset contains 113 tweets posted by 54 unique Twitter users between 2013-02-18 and 2013-02-25.
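
These figures can be recomputed directly from the raw data frame; a quick check, using the twitteR column names listed above:

# sanity check of the summary figures
nrow(df)                       # number of tweets
length(unique(df$screenName))  # number of unique users
range(df$created)              # time span of the dataset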

Because the tweet information retrieved through twitteR is limited (see the reference manual, p. 11), we need to extract additional user information, such as reply_to and retweet_from, from the tweets themselves. The metadata names used by twitteR also differ from those in the official Twitter API, so the preprocessing function renames some attributes of the tweet data as well.
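
PreprocessTweets is defined in munge_tweets.R. For readers without the source files, here is a minimal sketch of the kind of renaming and text-based extraction it performs; the Sketch suffix and the regular expressions are assumptions, not the package's actual code.

# a hypothetical sketch, not the actual PreprocessTweets()
PreprocessTweetsSketch <- function(df) {
    # rename twitteR metadata to Twitter-API-style names
    names(df)[names(df) == "created"] <- "created_at"
    names(df)[names(df) == "screenName"] <- "screen_name"
    df$from_user <- df$screen_name
    # user a tweet replies to: a leading '@user' in the text
    df$reply_to <- ifelse(grepl("^@\\w+", df$text),
                          sub("^@(\\w+).*", "\\1", df$text), NA)
    # user a retweet comes from: 'RT @user' in the text
    df$retweet_from <- ifelse(grepl("RT @\\w+", df$text),
                              sub(".*RT @(\\w+).*", "\\1", df$text), NA)
    df
}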

# preprocessing
df <- PreprocessTweets(df)

# structure of df
str(df)
## 'data.frame':    113 obs. of  13 variables:
##  $ text        : chr  "#lak13 RT @anachrobot: Cultural differences affect how network connectivity informs collaboration choice #cscw2013 culture sess"| __truncated__ "@drchuck What is the next MOOC you're teaching? You mentioned it during the LTI session #LAK13" "@drchuck talking at #lak13 about #lti 2.x - getting data back from 3rd party tools - to help feed central/external analytics so"| __truncated__ "RT @gsiemens: In 10 min, @drchuck on Learning Tools Interoperability, here in Collaborate: https://t.co/rD8rtO05XV #lak13 #lear"| __truncated__ ...
##  $ favorited   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSN   : logi  NA NA NA NA NA NA ...
##  $ created_at  : POSIXct, format: "2013-02-25 17:09:34" "2013-02-25 16:57:54" ...
##  $ truncated   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID  : chr  NA NA NA NA ...
##  $ id          : chr  "306088559197188097" "306085624992120832" "306084573811470336" "306077697187143680" ...
##  $ replyToUID  : logi  NA NA NA NA NA NA ...
##  $ statusSource: chr  "&lt;a href=&quot;http://www.tweetdeck.com&quot;&gt;TweetDeck&lt;/a&gt;" "&lt;a href=&quot;http://twitter.com/&quot;&gt;web&lt;/a&gt;" "&lt;a href=&quot;http://itunes.apple.com/us/app/twitter/id409789998?mt=12&quot;&gt;Twitter for Mac&lt;/a&gt;" "&lt;a href=&quot;http://twitter.com/download/android&quot;&gt;Twitter for Android&lt;/a&gt;" ...
##  $ screen_name : chr  "sjgknight" "alexip" "malcolmmurray" "ae_romero" ...
##  $ from_user   : chr  "sjgknight" "alexip" "malcolmmurray" "ae_romero" ...
##  $ reply_to    : chr  NA "drchuck" "drchuck" NA ...
##  $ retweet_from: chr  NA NA NA "gsiemens" ...

2. Count things

Count, for each user, the number of tweets posted, retweets received, and replies received.
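
GetTweetCountTable comes from the sourced files; a plausible minimal equivalent, assuming it returns a two-column data frame of users and counts (which the merges below rely on):

# a hypothetical stand-in for GetTweetCountTable
GetTweetCountTableSketch <- function(df, col) {
    counts <- as.data.frame(table(df[[col]]), stringsAsFactors = FALSE)
    names(counts) <- c("user", "count")
    counts
}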

library(ggplot2)
library(reshape2)

# Count tables
countTweets <- GetTweetCountTable(df, "from_user")
countRetweets <- GetTweetCountTable(df, "retweet_from")
countReplies <- GetTweetCountTable(df, "reply_to")

# combine counts into one data frame
# combine counts into one data frame; columns follow the merge order:
# tweets, then retweets received, then replies received
counts1 <- merge(countTweets, countRetweets, by = "user", all.x = TRUE)
counts <- merge(counts1, countReplies, by = "user", all.x = TRUE)
colnames(counts) <- c("user", "tweets", "retweeted_by", "replied_to")
counts[is.na(counts)] <- 0
counts$tweets <- as.numeric(counts$tweets)
counts$retweeted_by <- as.numeric(counts$retweeted_by)
counts$replied_to <- as.numeric(counts$replied_to)

# melt data
counts.melt <- melt(counts, id.vars = c("user"))

# plot (Cleveland dot plot)
ggplot(counts.melt, aes(x = user, y = value, color = variable)) + geom_point() + 
    coord_flip() + ggtitle("Counts of tweets, retweets, and replies") + xlab("Users") + 
    ylab("Counts")

plot of chunk counttables

Ratio of retweets to tweets

# create new column 'ratio'
counts$ratio <- counts$retweeted_by/counts$tweets

# plot the ratio for users who have been retweeted at least once
ggplot(counts[counts$retweeted_by > 0, ], aes(x = reorder(user, ratio), y = ratio)) + 
    geom_point() + coord_flip() + ggtitle("Ratio of retweets to tweets") + xlab("Users") + 
    ylab("Retweets/Tweets ratio")

plot of chunk ratio

3. Social Network Analysis

Plot social network
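
The next chunk relies on CreateSNADataFrame from social_analysis.R. For reference, a minimal sketch of the edge-list construction it presumably performs (names and details are assumptions):

# a hypothetical sketch of CreateSNADataFrame
CreateSNADataFrameSketch <- function(df, from, to, linkNames) {
    # keep only rows where the target user exists
    edges <- df[!is.na(df[[to]]), c(from, to)]
    edges$label <- linkNames
    names(edges) <- c("from", "to", "label")
    edges
}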

# load source file first
source("social_analysis.R")

# create data frame
rt.df <- CreateSNADataFrame(df, from = "from_user", to = "retweet_from", linkNames = "rt")
rp.df <- CreateSNADataFrame(df, from = "from_user", to = "reply_to", linkNames = "rp")

# begin social network analysis plotting
require(igraph)
require(sna)
require(Matrix)
require(SparseM)

# create graph data frame (igraph)
g <- graph.data.frame(rt.df, directed = TRUE)

# quick and dirty plot with igraph
plot.igraph(g)

plot of chunk sna
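
The default igraph plot is cluttered even for a small network. One optional refinement, my own sketch rather than part of the package, is to size vertices by degree and shrink labels and arrows:

# a more legible igraph plot (sketch); igraph::degree avoids a
# name clash with sna::degree once both packages are attached
plot(g, vertex.size = 3 + 2 * sqrt(igraph::degree(g)),
     vertex.label.cex = 0.7, edge.arrow.size = 0.3)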


# plot with sna: first get the adjacency matrix
mat <- get.adjacency(g)
# convert to a CSR matrix provided by SparseM
# ref: http://cos.name/cn/topic/108758
mat.csr <- as.matrix.csr(mat, ncol = ncol(mat))

# plot with sna
gplot(mat.csr)

plot of chunk sna

Basic SNA measures

The density of this network is 0.0263, the reciprocity among users is 0.95, and the degree centralization is 0.2287.

These measures are calculated as follows.

# density
gden(mat.csr)
## [1] 0.02628

# reciprocity
grecip(mat.csr)
##  Mut 
## 0.95

# centralization
centralization(mat.csr, sna::degree)
## [1] 0.2287
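
As a rough sanity check, the density of a directed graph is the number of ties divided by the number of possible ties. A sketch that treats the adjacency matrix as binary (gden may weigh repeated retweets differently):

# manual density check: ties / (n * (n - 1)), ignoring edge weights
m <- as.matrix(mat)
n <- ncol(m)
sum(m > 0) / (n * (n - 1))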

Univariate Conditional Uniform Graph Tests

A conditional uniform graph (CUG) test compares an observed graph statistic against its distribution over random graphs that are conditioned, by default, on the size of the observed network.

# density
cug.gden <- cug.test(mat.csr, gden)
plot(cug.gden)

plot of chunk cug

range(cug.gden$rep.stat)
## [1] 0.4641 0.5378

# reciprocity
cug.recip <- cug.test(mat.csr, grecip)
plot(cug.recip)

plot of chunk cug

range(cug.recip$rep.stat)
## [1] 0.4449 0.5641

# transitivity
cug.gtrans <- cug.test(mat.csr, gtrans)
plot(cug.gtrans)

plot of chunk cug

range(cug.gtrans$rep.stat)
## [1] 0.4511 0.5433

# centralization
cug.cent <- cug.test(mat.csr, centralization, FUN.arg = list(FUN = degree))
plot(cug.cent)

plot of chunk cug

range(cug.cent$rep.stat)
## [1] 0.06545 0.24157
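
The plots above show where the observed statistic falls within the null distribution. The test objects also store the observed value and empirical p-values; the field names below are those documented for sna's cug.test objects:

# observed statistic and empirical p-values for the density test
cug.gden$obs.stat  # observed density
cug.gden$pgteobs   # Pr(null statistic >= observed)
cug.gden$plteobs   # Pr(null statistic <= observed)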

Community detection

g.wc <- walktrap.community(g, steps = 1000, modularity = TRUE)

# number of communities
length(g.wc)
## [1] 10
# sizes of communities
sizes(g.wc)
## Community sizes
##  1  2  3  4  5  6  7  8  9 10 
##  5  8 15  2  2  2  2  2  1  1
# plot
plot(as.dendrogram(g.wc))

plot of chunk detectcommunity

We have detected 10 communities in this network. The largest community contains 27.778% of all users.
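
The reported percentage appears to be taken over all 54 users in the dataset rather than only the vertices of the retweet graph; it can be derived as follows:

# largest community as a share of the 54 unique users
max(sizes(g.wc)) / length(unique(df$from_user)) * 100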

4. Semantic Analysis

Words

Make a word cloud.

# load source file first
source("semantic_analysis.R")

# construct corpus, with regular preprocessing performed
corpus <- ConstructCorpus(df$text, removeTags = TRUE, removeUsers = TRUE)

# make a word cloud
MakeWordCloud(corpus)

plot of chunk wordcloud
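
MakeWordCloud is defined in semantic_analysis.R. A plausible minimal version built on the tm and wordcloud packages; the thresholds and options here are assumptions:

# a hypothetical sketch of MakeWordCloud
MakeWordCloudSketch <- function(corpus) {
    require(tm)
    require(wordcloud)
    tdm <- TermDocumentMatrix(corpus)
    freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
    wordcloud(names(freq), freq, min.freq = 3, random.order = FALSE)
}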

# create a term-document matrix, keeping only tokens of at least
# three characters
td.mat <- TermDocumentMatrix(corpus, control = list(minWordLength = 3))

# have a quick look at the term document matrix
inspect(td.mat[1:10, 1:10])
## A term-document matrix (10 terms, 10 documents)
## 
## Non-/sparse entries: 1/99
## Sparsity           : 99%
## Maximal term length: 11 
## Weighting          : term frequency (tf)
## 
##              Docs
## Terms         1 2 3 4 5 6 7 8 9 10
##   100         0 0 0 0 0 0 0 0 0  0
##   2pm         0 0 0 0 0 0 0 0 0  0
##   3rd         0 0 1 0 0 0 0 0 0  0
##   acad        0 0 0 0 0 0 0 0 0  0
##   according   0 0 0 0 0 0 0 0 0  0
##   acheivement 0 0 0 0 0 0 0 0 0  0
##   activities  0 0 0 0 0 0 0 0 0  0
##   activity    0 0 0 0 0 0 0 0 0  0
##   adam        0 0 0 0 0 0 0 0 0  0
##   advantage   0 0 0 0 0 0 0 0 0  0

# frequent words
findFreqTerms(td.mat, lowfreq = 10)
##  [1] "activity"         "analytics"        "analyzing"       
##  [4] "canvas"           "capturing"        "data"            
##  [7] "discussion"       "feedback"         "fritz"           
## [10] "httpstco8di8qckz" "john"             "join"            
## [13] "learning"         "min"              "network"         
## [16] "peer"             "recipes"          "scale"           
## [19] "tools"            "using"

# find terms correlated with a given word (correlation >= 0.5)
findAssocs(td.mat, "learning", 0.5)
## analytics       min  feedback     fritz      join      peer     scale 
##      0.70      0.63      0.56      0.56      0.56      0.56      0.56
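
findAssocs reports terms whose frequency profiles across documents correlate with the given term at or above the threshold; lowering the limit broadens the list. For example, a looser follow-up query:

# follow-up query with a looser correlation limit
findAssocs(td.mat, "data", 0.3)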

Sentiment Analysis

Latent Semantic Analysis