Set the working directory and load source files (libraries).
# check working directory
getwd()
## [1] "/home/bodong/src/r/twitter-analytics/twitter-hashtag-analytics"
# note that knitr automatically sets the working directory to where the
# Rmd file is; if you wish to run code line-by-line, set it manually
# setwd('/home/bodong/src/r/twitter-analytics/twitter-hashtag-analytics')
# load source files
source("get_tweets.R")
source("munge_tweets.R")
source("utilities.R")
Retrieve a Twitter hashtag dataset by search:
# get tweets by search; this function is defined in get_tweets.R
# df <- GetTweetsBySearch('#LAK13')
# save or load data (for debugging)
# save(df, file='./data/df.Rda')
load("./data/df.Rda")
# check names of columns
names(df)
## [1] "text" "favorited" "replyToSN" "created"
## [5] "truncated" "replyToSID" "id" "replyToUID"
## [9] "statusSource" "screenName"
This dataset contains 113 tweets posted by 54 unique Twitter users between 2013-02-18 and 2013-02-25.
Because the tweet information retrieved through twitteR is somewhat limited (see the reference manual, p. 11), we need to extract user information, such as reply_to_user and retweet_from_user. In addition, since the metadata names in twitteR differ from those used in the official Twitter API, the function also renames some attributes of the tweet data.
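PreprocessTweets is defined in munge_tweets.R. A hypothetical sketch of the kind of extraction it performs, using base-R regular expressions (the real function may differ in detail):

# hypothetical sketch of the user-info extraction done by PreprocessTweets
ExtractUserInfo <- function(df) {
    # user being retweeted, from the 'RT @user' convention
    rt <- regmatches(df$text, regexec("RT @([A-Za-z0-9_]+)", df$text))
    df$retweet_from <- sapply(rt, function(m) if (length(m) == 2) m[2] else NA_character_)
    # user being replied to, from a leading '@user'
    rp <- regmatches(df$text, regexec("^@([A-Za-z0-9_]+)", df$text))
    df$reply_to <- sapply(rp, function(m) if (length(m) == 2) m[2] else NA_character_)
    df
}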
# preprocessing
df <- PreprocessTweets(df)
# structure of df
str(df)
## 'data.frame': 113 obs. of 13 variables:
## $ text : chr "#lak13 RT @anachrobot: Cultural differences affect how network connectivity informs collaboration choice #cscw2013 culture sess"| __truncated__ "@drchuck What is the next MOOC you're teaching? You mentioned it during the LTI session #LAK13" "@drchuck talking at #lak13 about #lti 2.x - getting data back from 3rd party tools - to help feed central/external analytics so"| __truncated__ "RT @gsiemens: In 10 min, @drchuck on Learning Tools Interoperability, here in Collaborate: https://t.co/rD8rtO05XV #lak13 #lear"| __truncated__ ...
## $ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSN : logi NA NA NA NA NA NA ...
## $ created_at : POSIXct, format: "2013-02-25 17:09:34" "2013-02-25 16:57:54" ...
## $ truncated : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSID : chr NA NA NA NA ...
## $ id : chr "306088559197188097" "306085624992120832" "306084573811470336" "306077697187143680" ...
## $ replyToUID : logi NA NA NA NA NA NA ...
## $ statusSource: chr "<a href="http://www.tweetdeck.com">TweetDeck</a>" "<a href="http://twitter.com/">web</a>" "<a href="http://itunes.apple.com/us/app/twitter/id409789998?mt=12">Twitter for Mac</a>" "<a href="http://twitter.com/download/android">Twitter for Android</a>" ...
## $ screen_name : chr "sjgknight" "alexip" "malcolmmurray" "ae_romero" ...
## $ from_user : chr "sjgknight" "alexip" "malcolmmurray" "ae_romero" ...
## $ reply_to : chr NA "drchuck" "drchuck" NA ...
## $ retweet_from: chr NA NA NA "gsiemens" ...
library(ggplot2)
library(reshape2)
# Count tables
countTweets <- GetTweetCountTable(df, "from_user")
countRetweets <- GetTweetCountTable(df, "retweet_from")
countReplies <- GetTweetCountTable(df, "reply_to")
# combine counts into one data frame
counts1 <- merge(countTweets, countRetweets, by = "user", all.x = TRUE)
counts <- merge(counts1, countReplies, by = "user", all.x = TRUE)
# name columns following the merge order: tweets, then retweets, then replies
colnames(counts) <- c("user", "tweets", "retweeted_by", "replied_to")
counts[is.na(counts)] <- 0
counts$tweets <- as.numeric(counts$tweets)
counts$replied_to <- as.numeric(counts$replied_to)
counts$retweeted_by <- as.numeric(counts$retweeted_by)
# melt data
counts.melt <- melt(counts, id.vars = c("user"))
# plot (Cleveland dot plot)
ggplot(counts.melt, aes(x = user, y = value, color = variable)) + geom_point() +
    coord_flip() + ggtitle("Counts of tweets, retweets, and replies") + xlab("Users") +
    ylab("Counts")
# create new column 'ratio'
counts$ratio <- counts$retweeted_by/counts$tweets
# plot the ratio for users who have been retweeted at least once
ggplot(counts[counts$retweeted_by > 0, ], aes(x = reorder(user, ratio), y = ratio)) +
geom_point() + coord_flip() + ggtitle("Ratio of retweets to tweets") + xlab("Users") +
ylab("Retweets/Tweets ratio")
# load source file first
source("social_analysis.R")
# create data frame
rt.df <- CreateSNADataFrame(df, from = "from_user", to = "retweet_from", linkNames = "rt")
rp.df <- CreateSNADataFrame(df, from = "from_user", to = "reply_to", linkNames = "rp")
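CreateSNADataFrame is defined in social_analysis.R. A plausible sketch is an edge-list builder along these lines (assumed, not the actual code):

# hypothetical sketch of CreateSNADataFrame: build an edge list from
# sender/target columns, dropping rows without a target user
CreateSNADataFrame <- function(df, from, to, linkNames) {
    edges <- df[!is.na(df[[to]]), c(from, to)]
    names(edges) <- c("from", "to")
    edges$link <- linkNames
    edges
}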
# begin social network analysis plotting
require(igraph)
require(sna)
require(Matrix)
require(SparseM)
# create graph data frame (igraph)
g <- graph.data.frame(rt.df, directed = TRUE)
# quick and dirty plot with igraph
plot.igraph(g)
# plot with sna: first get the adjacency matrix
mat <- get.adjacency(g)
# convert to a csr matrix provided by SparseM
# ref: http://cos.name/cn/topic/108758
mat.csr <- as.matrix.csr(mat, ncol = ncol(mat))
# plot with sna
gplot(mat.csr)
The density of this network is 0.0263, its reciprocity is 0.95, and its degree centralization is 0.2287.
These measures are calculated as below.
# density
gden(mat.csr)
## [1] 0.02628
# reciprocity
grecip(mat.csr)
## Mut
## 0.95
# centralization
centralization(mat.csr, sna::degree)
## [1] 0.2287
# CUG (conditional uniform graph) tests compare each observed statistic
# against its distribution over random graphs of the same size
# density
cug.gden <- cug.test(mat.csr, gden)
plot(cug.gden)
range(cug.gden$rep.stat)
## [1] 0.4641 0.5378
# reciprocity
cug.recip <- cug.test(mat.csr, grecip)
plot(cug.recip)
range(cug.recip$rep.stat)
## [1] 0.4449 0.5641
# transitivity
cug.gtrans <- cug.test(mat.csr, gtrans)
plot(cug.gtrans)
range(cug.gtrans$rep.stat)
## [1] 0.4511 0.5433
# centralization
cug.cent <- cug.test(mat.csr, centralization, FUN.arg = list(FUN = sna::degree))
plot(cug.cent)
range(cug.cent$rep.stat)
## [1] 0.06545 0.24157
g.wc <- walktrap.community(g, steps = 1000, modularity = TRUE)
# number of communities
length(g.wc)
## [1] 10
# sizes of communities
sizes(g.wc)
## Community sizes
## 1 2 3 4 5 6 7 8 9 10
## 5 8 15 2 2 2 2 2 1 1
# plot
plot(as.dendrogram(g.wc))
We have detected 10 communities in this network. The largest community contains 27.778% of all users.
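To see which users belong to each community, igraph's membership() accessor can be applied to the result:

# community membership per user
membership(g.wc)
# or a list of users per community
# communities(g.wc)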
Make a word cloud.
# load source file first
source("semantic_analysis.R")
# construct corpus, with regular preprocessing performed
corpus <- ConstructCorpus(df$text, removeTags = TRUE, removeUsers = TRUE)
# make a word cloud
MakeWordCloud(corpus)
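ConstructCorpus and MakeWordCloud are defined in semantic_analysis.R. Sketches built on the tm and wordcloud packages might look like the following (the actual implementations may differ; newer versions of tm require wrapping tolower in content_transformer()):

# hypothetical sketches of the semantic_analysis.R helpers
library(tm)
library(wordcloud)
ConstructCorpus <- function(texts, removeTags = FALSE, removeUsers = FALSE) {
    if (removeTags) texts <- gsub("#\\w+", "", texts)  # strip hashtags
    if (removeUsers) texts <- gsub("@\\w+", "", texts)  # strip @mentions
    corpus <- Corpus(VectorSource(texts))
    corpus <- tm_map(corpus, tolower)
    corpus <- tm_map(corpus, removePunctuation)
    tm_map(corpus, removeWords, stopwords("english"))
}
MakeWordCloud <- function(corpus) {
    tdm <- TermDocumentMatrix(corpus)
    freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
    wordcloud(names(freq), freq, min.freq = 3, random.order = FALSE)
}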
# create a term-document matrix; keep only tokens of at least three
# characters
td.mat <- TermDocumentMatrix(corpus, control = list(minWordLength = 3))
# have a quick look at the term document matrix
inspect(td.mat[1:10, 1:10])
## A term-document matrix (10 terms, 10 documents)
##
## Non-/sparse entries: 1/99
## Sparsity : 99%
## Maximal term length: 11
## Weighting : term frequency (tf)
##
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10
## 100 0 0 0 0 0 0 0 0 0 0
## 2pm 0 0 0 0 0 0 0 0 0 0
## 3rd 0 0 1 0 0 0 0 0 0 0
## acad 0 0 0 0 0 0 0 0 0 0
## according 0 0 0 0 0 0 0 0 0 0
## acheivement 0 0 0 0 0 0 0 0 0 0
## activities 0 0 0 0 0 0 0 0 0 0
## activity 0 0 0 0 0 0 0 0 0 0
## adam 0 0 0 0 0 0 0 0 0 0
## advantage 0 0 0 0 0 0 0 0 0 0
# frequent words
findFreqTerms(td.mat, lowfreq = 10)
## [1] "activity" "analytics" "analyzing"
## [4] "canvas" "capturing" "data"
## [7] "discussion" "feedback" "fritz"
## [10] "httpstco8di8qckz" "john" "join"
## [13] "learning" "min" "network"
## [16] "peer" "recipes" "scale"
## [19] "tools" "using"
# find related words of a word
findAssocs(td.mat, "learning", 0.5)
## analytics min feedback fritz join peer scale
## 0.70 0.63 0.56 0.56 0.56 0.56 0.56