A graphical report on a search for up to 1500 recent tweets tagged #RNchat.
First, who is being RTd, and how often were they RTd in the sample?
[Disable output with r opts_chunk$set(echo=FALSE, message=FALSE) in single backtick quotes]
Let's start by seeing who's been tweeting most amongst the sampled tweets…
require(googleVis)
## Loading required package: googleVis
## Loading required package: RJSONIO
## Welcome to googleVis version 0.2.16
##
## Please read the Google API Terms of Use before you use the package:
## http://code.google.com/apis/terms/index.html
##
## Type ?googleVis to access the overall documentation and
## vignette('googleVis') for the package vignette. You can execute a demo of
## the package via: demo(googleVis)
##
## More information is available on the googleVis project web-site:
## http://code.google.com/p/google-motion-charts-with-r/
##
## Contact: <rvisualisation@gmail.com>
##
## To suppress the this message use:
## suppressPackageStartupMessages(library(googleVis))
require(stringr)
## Loading required package: stringr
require(twitteR)
## Loading required package: twitteR
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
## Attaching package: 'rjson'
## The following object(s) are masked from 'package:RJSONIO':
##
## fromJSON, toJSON
# The original example used the twitteR library to pull in a user stream
# rdmTweets <- userTimeline('psychemedia', n=100) Instead, I'm going to
# pull in a search around a hashtag.
# Build the hashtag query from the bare tag name and run the search.
fstub <- "RNchat"
searchTerm <- paste0("#", fstub)
# Request up to 1500 results and convert the returned list to a data frame.
rdmTweets <- searchTwitter(searchTerm, n = 1500)
tw.df <- twListToDF(rdmTweets)
# Expose the sender under the column name the parsing/counting helpers read.
tw.df$from_user <- tw.df$screenName
# Note that the Twitter search API only goes back 1500 tweets (I think?)
# Strip the first "@" from each element of x (e.g. "@user" -> "user").
trim <- function(x) {
  sub(pattern = "@", replacement = "", x = x)
}
# Derive conversation columns from raw tweets.
#
# Args:
#   df: data frame with a `text` column (tweet bodies) and a `from_user`
#       column (sender screen names).
#
# Returns:
#   `df` with three extra character columns:
#     to   - user addressed when the tweet starts with "@name", else NA
#     rtof - user being retweeted when the tweet starts with "RT @name", else NA
#     rtby - the sender (`from_user`) on rows where `rtof` is set, else NA
twParse <- function(df) {
  text <- as.character(df$text)

  # First match of `pattern` in every tweet, NA where there is none.
  first_match <- function(pattern) {
    found <- rep(NA_character_, length(text))
    pos <- regexpr(pattern, text)
    hit <- !is.na(pos) & pos != -1L
    found[hit] <- regmatches(text, pos)
    found
  }

  # Parsing @ messages
  df$to <- sub("^@", "", first_match("^@[[:alnum:]_]*"))
  # Parsing RT: messages
  df$rtof <- sub("^RT @", "", first_match("^RT @[[:alnum:]_]*"))
  # Parsing RT: senders. Test for a real NA rather than the string "NA", so a
  # retweeted account that is literally called "NA" is still counted.
  rtby <- as.character(df$from_user)
  rtby[is.na(df$rtof)] <- NA_character_
  df$rtby <- rtby
  df
}
df.data = twParse(tw.df)
# Tabulate how often each user appears in each role and combine the tallies.
#
# Args:
#   df: data frame with `to`, `from_user`, `rtof` and `rtby` columns.
#
# Returns:
#   Data frame with a factor column `Name` plus `rtofCount`, `toCount`,
#   `rtbyCount` and `fromCount`; a count is NA where a user never appears in
#   that role.
twCounts <- function(df) {
  # Frequency table of one column as a two-column (Name, <label>) data frame.
  tally <- function(values, label) {
    freq <- data.frame(table(values))
    colnames(freq) <- c("Name", label)
    freq
  }
  pieces <- list(
    tally(df$rtof, "rtofCount"),
    tally(df$to, "toCount"),
    tally(df$rtby, "rtbyCount"),
    tally(df$from_user, "fromCount")
  )
  # Full outer join on the shared Name column, folded left to right.
  combined <- Reduce(function(acc, nxt) merge(acc, nxt, all = TRUE), pieces)
  combined$Name <- factor(combined$Name)
  combined
}
df.counts = twCounts(df.data)
# Order factors for display in an ordered bar chart
# Turn a vector into a factor whose levels run from the least to the most
# frequent value, for use in an ordered bar chart.
barsorter <- function(dfc) {
  freq <- table(dfc)
  factor(dfc, levels = names(sort(freq)))
}
require(ggplot2)
## Loading required package: ggplot2
## Find out what's changed in ggplot2 with news(Version == "0.9.1", package =
## "ggplot2")
df.data$frm = barsorter(df.data$from_user)
p = ggplot() + geom_bar(aes(x = na.omit(df.data$frm))) + opts(axis.text.x = theme_text(angle = -90)) +
xlab(NULL)
print(p)
And who's been RTd most:
df.data$hrt = barsorter(df.data$rtof)
p = ggplot() + geom_bar(aes(x = na.omit(df.data$hrt))) + opts(axis.text.x = theme_text(angle = -90)) +
xlab(NULL)
print(p)
It's easy to add in Google Chart component sortable tables:
gTable <- gvisTable(df.counts, options = list(width = 600, height = 300, page = "enable"))
print(gTable, "chart")
require(xtable)
## Loading required package: xtable
require(plyr)
## Loading required package: plyr
## Attaching package: 'plyr'
## The following object(s) are masked from 'package:twitteR':
##
## id
# Top ten users ranked by how often they were retweeted, ties broken by how
# often they tweeted. Output type and caption placement are print.xtable()
# options (xtable() silently ignores them), so they are not passed here.
rtof.table <- xtable(
  head(arrange(df.counts, desc(rtofCount), desc(fromCount)), 10),
  caption = "Top ten users by 'RT of' and 'from' count"
)
Start off with some simple summary tables of who's been tweeting, RTd, etc.
# Render as HTML; caption.placement only takes effect when given to print().
print(rtof.table, type = "html", caption.placement = "top")
| Name | rtofCount | toCount | rtbyCount | fromCount | |
|---|---|---|---|---|---|
| 1 | NurseTogether | 7 | 1 | 10 | |
| 2 | EllenRichter | 7 | 1 | 4 | |
| 3 | rdjfraser | 3 | 1 | ||
| 4 | RNchat | 2 | 2 | 5 | |
| 5 | theyoganurse | 2 | 4 | ||
| 6 | dlschermd | 2 | 1 | ||
| 7 | bestnursedegree | 1 | 6 | ||
| 8 | bobriddelllive | 1 | 4 | ||
| 9 | icorluvs2write | 1 | 1 | 3 | |
| 10 | amyrnbsn | 1 | 2 |
# Bring rtbyCount forward. Selecting by name rather than position keeps this
# step correct whatever the current column order is.
df.counts <- df.counts[, c("Name", "rtbyCount", "rtofCount", "toCount", "fromCount")]
x.table <- xtable(
  head(arrange(df.counts, desc(rtbyCount), desc(fromCount)), 10),
  caption = "Top ten users by 'RT by'' count"
)
# caption.placement is a print.xtable() option, so it is set here.
print(x.table, type = "html", caption.placement = "top")
| Name | rtbyCount | rtofCount | toCount | fromCount | |
|---|---|---|---|---|---|
| 1 | NurseTogether | 1 | 7 | 10 | |
| 2 | ElinSilveous | 1 | 6 | ||
| 3 | icorluvs2write | 1 | 1 | 3 | |
| 4 | WantNews | 1 | 3 | ||
| 5 | ShelleyWebbRN | 1 | 2 | ||
| 6 | DrPam | 1 | 1 | 1 | |
| 7 | TheNerdyNurse | 1 | 1 | 1 | |
| 8 | 1ONLY1QUEEN | 1 | 1 | ||
| 9 | Aggiern95 | 1 | 1 | ||
| 10 | B_Yakky | 1 | 1 |
# Bring fromCount forward (selected by name, not position) and show the ten
# most active senders. caption.placement is a print.xtable() option.
df.counts <- df.counts[, c("Name", "fromCount", "rtbyCount", "rtofCount", "toCount")]
print(
  xtable(
    head(arrange(df.counts, desc(fromCount), desc(rtofCount)), 10),
    caption = "Top ten users by 'from'' count"
  ),
  type = "html", caption.placement = "top"
)
| Name | fromCount | rtbyCount | rtofCount | toCount | |
|---|---|---|---|---|---|
| 1 | nursefriendly | 18 | 6 | ||
| 2 | NurseTogether | 10 | 1 | 7 | |
| 3 | bestnursedegree | 6 | 1 | ||
| 4 | ElinSilveous | 6 | 1 | ||
| 5 | RNchat | 5 | 2 | 2 | |
| 6 | ShahinaLakhani | 5 | |||
| 7 | EllenRichter | 4 | 7 | 1 | |
| 8 | theyoganurse | 4 | 2 | ||
| 9 | bobriddelllive | 4 | 1 | ||
| 10 | icorluvs2write | 3 | 1 | 1 |
Now let's try an accession plot (based on an original idea by @mediaczar)
## 1) find each user's earliest tweet in the sample
tw.dfx <- ddply(df.data, .variables = "screenName", .fun = function(x) {
  subset(x, created %in% min(created), select = c(screenName, created))
})
## 2) arrange the users in accession order
tw.dfxa <- arrange(tw.dfx, created)
## 3) Use the username accession order to order the screenName factors in
## the searchlist. unique() guards against a user being listed twice when
## several of their tweets share the earliest timestamp, since factor levels
## must not contain duplicates.
df.data$screenName <- factor(
  df.data$screenName,
  levels = unique(as.character(tw.dfxa$screenName))
)
# ggplot seems to be able to cope with time typed values...
p = ggplot(df.data) + geom_point(aes(x = created, y = screenName))
p = p + opts(axis.text.y = theme_text()) + ylab(NULL) + xlab(NULL)
print(p)
The accession plot shows the accession of folk using the search term in the tweet sample, and each of their sampled tweets thereafter.
We can add value to the chart by colouring tweets to see which were original tweets and which were RTs.
df.data$rtt = sapply(df.data$rtof, function(rt) if (is.na(rt)) "T" else "RT")
p = ggplot(df.data) + geom_point(aes(x = created, y = screenName,
col = rtt))
p = p + opts(axis.text.y = theme_text()) + xlab(NULL) + ylab(NULL)
print(p)
We can also limit the chart to only show original tweets:
p = ggplot(subset(df.data, rtt == "T")) + geom_point(aes(x = created,
y = screenName, col = rtt), colour = "aquamarine3")
p = p + opts(axis.text.y = theme_text()) + xlab(NULL) + ylab(NULL)
print(p)
Or only show RTs:
p = ggplot(subset(df.data, rtt == "RT")) + geom_point(aes(x = created,
y = screenName), colour = "red")
p = p + opts(axis.text.y = theme_text()) + xlab(NULL) + ylab(NULL)
print(p)
# Delete every @mention ("@" followed by word characters) from a tweet.
RemoveAtPeople <- function(tweet) {
  mention <- "@\\w+"
  gsub(mention, "", tweet)
}
tweets <- as.vector(sapply(df.data$text, RemoveAtPeople))
require(tm)
## Loading required package: tm
# Build a cleaned tm corpus from a collection of texts.
#
# Args:
#   df: the texts to index; handed straight to VectorSource().
#   my.stopwords: extra words to strip in addition to the English stopword
#     list (default: none).
#
# Returns:
#   The corpus after punctuation removal, lower-casing and stopword removal,
#   applied in that order.
generateCorpus <- function(df, my.stopwords = c()) {
  corpus <- Corpus(VectorSource(df))
  # remove punctuation, then normalise case
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, tolower)
  # remove the standard English stopwords first, then the caller's own list
  for (words in list(stopwords("english"), my.stopwords)) {
    corpus <- tm_map(corpus, removeWords, words)
  }
  corpus
}
# Draw a word cloud of term frequencies in a corpus.
#
# Args:
#   corpus: corpus to pass to TermDocumentMatrix().
#   min.freq: forwarded to wordcloud() as its min.freq argument (default 3).
#
# Returns:
#   Whatever wordcloud() returns; the plot itself is drawn as a side effect.
wordcloud.generate <- function(corpus, min.freq = 3) {
  require(wordcloud)
  term_doc <- TermDocumentMatrix(corpus, control = list(minWordLength = 1))
  # total occurrences of each term across all documents, most frequent first
  word_freq <- sort(rowSums(as.matrix(term_doc)), decreasing = TRUE)
  freq_table <- data.frame(word = names(word_freq), freq = word_freq)
  cloud <- wordcloud(freq_table$word, freq_table$freq, min.freq = min.freq)
  cloud
}
print(wordcloud.generate(generateCorpus(tweets), 7))
## Loading required package: wordcloud
## Loading required package: Rcpp
## Loading required package: RColorBrewer
## NULL
print(wordcloud.generate(generateCorpus(tweets, tolower(fstub)),
7))
## NULL
Let's look to see what tags were used in the sample four times or more:
# hashtag processing via http://stackoverflow.com/a/9360445/454773
# Expand a tweet data frame to one row per (tweet, hashtag) pair.
#
# Args:
#   tmp: data frame with a `text` column holding the tweet bodies.
#
# Returns:
#   A data frame with all columns of `tmp` plus a character column `tag`.
#   A tweet carrying k hashtags contributes k rows (one per tag); a tweet with
#   no hashtag contributes a single row with `tag` set to NA. Tagged rows come
#   first, untagged rows last.
hashtagAugment <- function(tmp) {
  text <- as.character(tmp$text)
  tags <- regmatches(text, gregexpr("#[a-zA-Z0-9]+", text))
  n_tags <- vapply(tags, length, integer(1))

  # Repeat each row once per tag it carries; this is simply empty when no
  # tweet is tagged, so no special case is needed.
  index <- rep.int(seq_len(nrow(tmp)), n_tags)
  tagged <- tmp[index, , drop = FALSE]
  tagged$tag <- as.character(unlist(tags))

  # Untagged tweets need the same `tag` column, otherwise rbind() rejects the
  # two pieces for having different numbers of columns.
  not_tagged <- tmp[n_tags == 0L, , drop = FALSE]
  not_tagged$tag <- rep(NA_character_, nrow(not_tagged))

  rbind(tagged, not_tagged)
}
df.data.t = hashtagAugment(df.data)
tag.count = data.frame(table(df.data.t$tag))
colnames(tag.count) = c("tag", "tagCount")
#
# p=ggplot(df.data.t,aes(x=na.omit(tag)))+geom_bar(aes(y=(..count..),x=reorder(tag,rep(1,length(tag)),sum)))
# + xlab(NULL) + opts(axis.text.x=theme_text(angle=-90,size=6))
# Bar chart of the tags used more than three times, bars ordered by count.
# stat = "identity" is a geom_bar() argument (use the supplied counts as bar
# heights); it is not an aesthetic, so it has to sit outside aes().
p <- ggplot(subset(tag.count, tagCount > 3)) +
  geom_bar(aes(x = reorder(tag, tagCount), y = tagCount), stat = "identity") +
  xlab(NULL) + opts(axis.text.x = theme_text(angle = -90))
print(p)
# Ten most used tags as an HTML table. caption.placement is a print.xtable()
# option, so it is given to print() rather than xtable().
print(
  xtable(head(arrange(tag.count, desc(tagCount)), 10), caption = "Top ten tags"),
  type = "html", caption.placement = "top"
)
| tag | tagCount | |
|---|---|---|
| 1 | #RNchat | 63 |
| 2 | #rnchat | 53 |
| 3 | #NurSM | 19 |
| 4 | #RNChat | 19 |
| 5 | #NurseTogetherChat | 16 |
| 6 | #hcsm | 16 |
| 7 | #meded | 15 |
| 8 | #nurse | 11 |
| 9 | #nurses | 10 |
| 10 | #MDchat | 8 |