Before analyzing tweets, we first load a few source files (utility scripts) from this project.
# check working directory
getwd()
# note that knitr automatically sets the working directory to where the Rmd
# file is, so if you wish to run code line-by-line, you should setwd() manually
# setwd('/home/bodong/src/r/twitter-analytics/twitter-hashtag-analytics')
# load source files
source("utils/utilities.R", local = TRUE)
source("utils/get_tweets.R")
source("utils/munge_tweets.R")
source("utils/semantic_analysis.R")
source("utils/social_analysis.R")
Next, retrieve an archive of tweets from a Google Spreadsheet archive.
# get tweets by search; this function is defined in get_tweets.R
# df <- GetTweetsFromGoogleDrive('0Aup6zwZoYbZ1dE1PZThqNTJwdUdHdEdqSDBUWkNfdXc')
# save or load data (so you can reuse data rather than search all the time)
# save(df, file='./data/funda.Rda')
load("data/funda.Rda")
# structure of df
str(df)
## 'data.frame': 7266 obs. of 19 variables:
## $ id_str : num 3.78e+17 3.78e+17 3.78e+17 3.78e+17 3.78e+17 ...
## $ from_user : chr "Loolaan_27_93" "huisX" "huisX" "huisX" ...
## $ text : chr "We hebben afgelopen maand 2 bezichtigingen gehad! Dus de huizenmarkt trekt weer aan. Twijfel je? Kijken is gratis!" "Brug keert terug (Nieuws) http://t.co/EpJuvwYc92 #huizenmarkt #economie" "Grondulow ontspruit aan boerenbrein (Nieuws) http://t.co/fHtVPTmgQW #huizenmarkt #economie" "Essent dingt met energiemaatregelen naar prijs (Nieuws) http://t.co/03JoljLK4u #huizenmarkt #economie" ...
## $ created_at : chr "2013-09-11 00:00:01" "2013-09-10 23:41:52" "2013-09-10 23:41:52" "2013-09-10 23:41:51" ...
## $ time : chr "11/09/2013 07:00:01" "11/09/2013 06:41:52" "11/09/2013 06:41:52" "11/09/2013 06:41:51" ...
## $ geo_coordinates : chr NA NA NA NA ...
## $ user_lang : chr "nl" "nl" "nl" "nl" ...
## $ in_reply_to_user_id_str : int NA NA NA NA NA NA NA NA NA NA ...
## $ in_reply_to_screen_name : chr "" "" "" "" ...
## $ from_user_id_str : int 733140816 245975426 245975426 245975426 245975426 245975426 245975426 245975426 362725009 228177539 ...
## $ in_reply_to_status_id_str: num NA NA NA NA NA NA NA NA NA NA ...
## $ source : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://ifttt.com\" rel=\"nofollow\">IFTTT</a>" "<a href=\"http://ifttt.com\" rel=\"nofollow\">IFTTT</a>" "<a href=\"http://ifttt.com\" rel=\"nofollow\">IFTTT</a>" ...
## $ profile_image_url : chr "http://a0.twimg.com/profile_images/3717487764/55dc46850fd238bed3d39d7116e02268_normal.jpeg" "http://a0.twimg.com/profile_images/1586141870/logo_twitter_normal.png" "http://a0.twimg.com/profile_images/1586141870/logo_twitter_normal.png" "http://a0.twimg.com/profile_images/1586141870/logo_twitter_normal.png" ...
## $ user_followers_count : int 114 1618 1618 1618 1618 1618 1618 1618 391 979 ...
## $ user_friends_count : int 511 1773 1773 1773 1773 1773 1773 1773 1107 24 ...
## $ user_utc_offset : int 10800 7200 7200 7200 7200 7200 7200 7200 7200 NA ...
## $ status_url : chr "http://twitter.com/Loolaan_27_93/statuses/377672861403852800" "http://twitter.com/huisX/statuses/377668293156691968" "http://twitter.com/huisX/statuses/377668291730616320" "http://twitter.com/huisX/statuses/377668290317148161" ...
## $ entities_str : chr "{\"symbols\":[],\"urls\":[],\"hashtags\":[],\"user_mentions\":[]}" "{\"symbols\":[],\"urls\":[{\"expanded_url\":\"http://bit.ly/17Pfn2z\",\"indices\":[26,48],\"display_url\":\"bit.ly/17Pfn2z\",\""| __truncated__ "{\"symbols\":[],\"urls\":[{\"expanded_url\":\"http://bit.ly/1d4VWEc\",\"indices\":[45,67],\"display_url\":\"bit.ly/1d4VWEc\",\""| __truncated__ "{\"symbols\":[],\"urls\":[{\"expanded_url\":\"http://bit.ly/17Pfn2v\",\"indices\":[56,78],\"display_url\":\"bit.ly/17Pfn2v\",\""| __truncated__ ...
## $ screen_name : chr "Loolaan_27_93" "huisX" "huisX" "huisX" ...
This dataset contains 7266 tweets posted by 1987 unique Twitter users between 2013-06-22 07:25:00 and 2013-09-11 00:00:01.
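These summary figures can be checked directly against the data frame:
# verify the summary figures above
nrow(df)                      # number of tweets
length(unique(df$from_user))  # number of unique users
range(df$created_at)          # earliest and latest timestamps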
# preprocessing: remove duplicate tweets, then keep the first 1000 to speed
# up this demo
df <- df[!duplicated(df$id_str), ]
df <- df[1:1000, ]
df <- PreprocessTweets(df)
# structure of df
str(df)
## 'data.frame': 1000 obs. of 25 variables:
## $ id_str : num 3.78e+17 3.78e+17 3.78e+17 3.78e+17 3.78e+17 ...
## $ from_user : chr "Loolaan_27_93" "huisX" "huisX" "huisX" ...
## $ text : chr "We hebben afgelopen maand 2 bezichtigingen gehad! Dus de huizenmarkt trekt weer aan. Twijfel je? Kijken is gratis!" "Brug keert terug (Nieuws) http://t.co/EpJuvwYc92 #huizenmarkt #economie" "Grondulow ontspruit aan boerenbrein (Nieuws) http://t.co/fHtVPTmgQW #huizenmarkt #economie" "Essent dingt met energiemaatregelen naar prijs (Nieuws) http://t.co/03JoljLK4u #huizenmarkt #economie" ...
## $ created_at : chr "2013-09-11 00:00:01" "2013-09-10 23:41:52" "2013-09-10 23:41:52" "2013-09-10 23:41:51" ...
## $ time : chr "11/09/2013 07:00:01" "11/09/2013 06:41:52" "11/09/2013 06:41:52" "11/09/2013 06:41:51" ...
## $ geo_coordinates : chr NA NA NA NA ...
## $ user_lang : chr "nl" "nl" "nl" "nl" ...
## $ in_reply_to_user_id_str : int NA NA NA NA NA NA NA NA NA NA ...
## $ in_reply_to_screen_name : chr "" "" "" "" ...
## $ from_user_id_str : int 733140816 245975426 245975426 245975426 245975426 245975426 245975426 245975426 362725009 228177539 ...
## $ in_reply_to_status_id_str: num NA NA NA NA NA NA NA NA NA NA ...
## $ source : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://ifttt.com\" rel=\"nofollow\">IFTTT</a>" "<a href=\"http://ifttt.com\" rel=\"nofollow\">IFTTT</a>" "<a href=\"http://ifttt.com\" rel=\"nofollow\">IFTTT</a>" ...
## $ profile_image_url : chr "http://a0.twimg.com/profile_images/3717487764/55dc46850fd238bed3d39d7116e02268_normal.jpeg" "http://a0.twimg.com/profile_images/1586141870/logo_twitter_normal.png" "http://a0.twimg.com/profile_images/1586141870/logo_twitter_normal.png" "http://a0.twimg.com/profile_images/1586141870/logo_twitter_normal.png" ...
## $ user_followers_count : int 114 1618 1618 1618 1618 1618 1618 1618 391 979 ...
## $ user_friends_count : int 511 1773 1773 1773 1773 1773 1773 1773 1107 24 ...
## $ user_utc_offset : int 10800 7200 7200 7200 7200 7200 7200 7200 7200 NA ...
## $ status_url : chr "http://twitter.com/Loolaan_27_93/statuses/377672861403852800" "http://twitter.com/huisX/statuses/377668293156691968" "http://twitter.com/huisX/statuses/377668291730616320" "http://twitter.com/huisX/statuses/377668290317148161" ...
## $ entities_str : chr "{\"symbols\":[],\"urls\":[],\"hashtags\":[],\"user_mentions\":[]}" "{\"symbols\":[],\"urls\":[{\"expanded_url\":\"http://bit.ly/17Pfn2z\",\"indices\":[26,48],\"display_url\":\"bit.ly/17Pfn2z\",\""| __truncated__ "{\"symbols\":[],\"urls\":[{\"expanded_url\":\"http://bit.ly/1d4VWEc\",\"indices\":[45,67],\"display_url\":\"bit.ly/1d4VWEc\",\""| __truncated__ "{\"symbols\":[],\"urls\":[{\"expanded_url\":\"http://bit.ly/17Pfn2v\",\"indices\":[56,78],\"display_url\":\"bit.ly/17Pfn2v\",\""| __truncated__ ...
## $ screen_name : chr "Loolaan_27_93" "huisX" "huisX" "huisX" ...
## $ reply_to : chr NA NA NA NA ...
## $ retweet_from : chr NA NA NA NA ...
## $ links : chr NA "http://t.co/EpJuvwYc92" "http://t.co/fHtVPTmgQW" "http://t.co/03JoljLK4u" ...
## $ longlinks : chr "" "http://t.co/EpJuvwYc92" "http://t.co/fHtVPTmgQW" "http://t.co/03JoljLK4u" ...
## $ linkTitle : chr "" "http://bit.ly/17Pfn2z" "http://bit.ly/1d4VWEc" "http://bit.ly/17Pfn2v" ...
## $ text_nourl : chr "We hebben afgelopen maand 2 bezichtigingen gehad! Dus de huizenmarkt trekt weer aan. Twijfel je? Kijken is gratis!" "Brug keert terug (Nieuws) #huizenmarkt #economie" "Grondulow ontspruit aan boerenbrein (Nieuws) #huizenmarkt #economie" "Essent dingt met energiemaatregelen naar prijs (Nieuws) #huizenmarkt #economie" ...
Regular statuses, retweets, and replies are the three main types of tweets we analyze. The GetTweetCountTable function makes it easy to count, for each user, the total number of tweets sent, the number of times their tweets were retweeted by others, and the number of replies they received.
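For intuition, a counting function of this kind can be as simple as the following sketch (an illustration only; the project's actual GetTweetCountTable may differ):
# illustrative sketch of a per-user count table; not the project's actual code
SketchCountTable <- function(df, col) {
    counts <- as.data.frame(table(df[[col]]), stringsAsFactors = FALSE)
    colnames(counts) <- c("user", "count")
    counts
}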
EnsurePackage("ggplot2")
EnsurePackage("reshape2")
# Count tables
countTweets <- GetTweetCountTable(df, "from_user")
countRetweets <- GetTweetCountTable(df, "retweet_from")
countReplies <- GetTweetCountTable(df, "reply_to")
# quickly check distribution of tweets per user
qplot(countTweets$count, binwidth = 1, xlab = "Number of Tweets")
# combine counts into one data frame (note the column order produced by the
# merges: tweets, then retweeted_by, then replied_to)
counts <- merge(countTweets, countRetweets, by = "user", all.x = TRUE)
counts <- merge(counts, countReplies, by = "user", all.x = TRUE)
colnames(counts) <- c("user", "tweets", "retweeted_by", "replied_to")
counts[is.na(counts)] <- 0
# melt data
counts.melt <- melt(counts, id.vars = c("user"))
# plot (Cleveland dot plot); note that with coord_flip() the x aesthetic
# (users) ends up on the vertical axis
ggplot(counts.melt, aes(x = user, y = value, color = variable)) + geom_point() +
coord_flip() + ggtitle("Counts of tweets, retweets, and replies") + xlab("Users") +
ylab("Counts")
To get a sense of how well received or valued one's tweets were within the community, we can further compute the ratio of retweets received to tweets sent.
# create new column 'ratio'
counts$ratio <- counts$retweeted_by/counts$tweets
# plot ratio for users who have at least one rt
ggplot(counts[counts$retweeted_by > 0, ], aes(x = reorder(user, ratio), y = ratio)) +
geom_point() + coord_flip() + ggtitle("Ratio of retweets to tweets") + xlab("Users") +
ylab("Retweets/Tweets ratio")
URLs embedded in tweets are important because they usually point to resources of interest to the community.
# count links
countLinks <- GetTweetCountTable(df, "links")
names(countLinks)[1] <- "url"
# check top links
head(countLinks[with(countLinks, order(-count)), ])
## url count
## 507 http://t.co/QNIIbXWGOJ 8
## 738 http://t.co/YJUepeNTAY 7
## 11 http://t.co/0Wr3iWEBhB 6
## 58 http://t.co/4IOrrSshmE 6
## 204 http://t.co/CKvEFZrMR6 6
## 427 http://t.co/MVlc6n4Dkn 5
# plot to see distribution of links
ggplot(countLinks[countLinks$count > 1, ], aes(reorder(url, count), count)) +
geom_point() + coord_flip() + xlab("URL") + ylab("Number of messages containing the URL")
Retweets and replies are the two main types of links among users contained in an archived tweet dataset. Some studies also look into following relations, but those require further queries to Twitter, so in this demo we focus on retweet and reply links.
The CreateSNADataFrame function in social_analysis.R provides an easy way to create a data frame containing all edges of the requested social network. With these edges, we can easily build an SNA graph and visualize it with packages like igraph and sna.
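For intuition, building such an edge list amounts to selecting the relevant columns (a simplified sketch; the actual CreateSNADataFrame may handle more cases):
# simplified sketch: retweet edges from the retweeting user to the retweeted user
rt.edges <- df[!is.na(df$retweet_from), c("from_user", "retweet_from")]
colnames(rt.edges) <- c("from", "to")
rt.edges$type <- "rt"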
# create data frame
rt.df <- CreateSNADataFrame(df, from = "from_user", to = "retweet_from", linkNames = "rt")
rp.df <- CreateSNADataFrame(df, from = "from_user", to = "reply_to", linkNames = "rp")
# begin social network analysis plotting
EnsurePackage("igraph")
EnsurePackage("sna")
EnsurePackage("Matrix")
EnsurePackage("SparseM")
# create graph data frame (igraph)
g <- graph.data.frame(rt.df, directed = TRUE)
# plot with igraph (quick and dirty)
plot.igraph(g)
# plot with sna
# first, get the adjacency matrix
mat <- get.adjacency(g)
# convert to a csr matrix provided by SparseM
# ref: http://cos.name/cn/topic/108758
mat.csr <- as.matrix.csr(mat, ncol = ncol(mat))
# plot with sna
gplot(mat.csr)
## Warning: 'x' is NULL so the result will be NULL
We can further compute some basic SNA measures. For instance, the density of this network is 0.0043, the reciprocity of users in the network is 0.9914, and the degree centralization of the network is 0.0655. These measures are calculated as follows.
# density
gden(mat.csr)
## [1] 0.00428
# reciprocity
grecip(mat.csr)
## Mut
## 0.9914
# centralization
centralization(mat.csr, sna::degree)
## [1] 0.0655
A regular task in SNA is to identify communities in a network. We can do this with the walktrap.community function in the igraph package.
g.wc <- walktrap.community(g, steps = 1000, modularity = TRUE)
# number of communities
length(g.wc)
## [1] 44
# sizes of communities
sizes(g.wc)
## Community sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 6 35 25 5 7 7 5 4 4 4 4 3 3 3 3 3 3 6 2 2 2 2 2 2 2
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# plot
plot(as.dendrogram(g.wc))
We have detected 44 communities in this network. The largest community contains 9.162% of all users in this dataset.
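These figures can be recomputed from the community object (assuming the percentage is taken over all unique users in the trimmed dataset):
# number of detected communities
length(g.wc)
# share of all users in the dataset who belong to the largest community
max(sizes(g.wc))/length(unique(df$from_user))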
In network analysis, people perform various tests to check whether some aspects of a network are unusual. We can run such tests, namely conditional uniform graph (CUG) tests, through the cug.test function in the sna package; see the sna documentation for further information about these tests.
# density
cug.gden <- cug.test(mat.csr, gden)
plot(cug.gden)
range(cug.gden$rep.stat)
## [1] 0.4903 0.5086
# reciprocity
cug.recip <- cug.test(mat.csr, grecip)
plot(cug.recip)
range(cug.recip$rep.stat)
## [1] 0.4862 0.5113
# transitivity
cug.gtrans <- cug.test(mat.csr, gtrans)
plot(cug.gtrans)
range(cug.gtrans$rep.stat)
## [1] 0.4903 0.5097
# centralization
cug.cent <- cug.test(mat.csr, centralization, FUN.arg = list(FUN = degree))
plot(cug.cent)
range(cug.cent$rep.stat)
## [1] 0.04791 0.11274
Turning to the text of the tweets, let's first make a word cloud.
# construct corpus, with regular preprocessing performed
corpus <- ConstructCorpus(df$text, removeTags = TRUE, removeUsers = TRUE)
# make a word cloud
MakeWordCloud(corpus)
## Warning: huizenmarkt could not be fit on page. It will not be plotted.
## Warning: font width/metrics unknown for characters such as 0x7f, 0x80,
## 0x82, 0x99 (a long run of repeated encoding-related warnings is omitted here)
## NULL
This task first uses ConstructCorpus in semantic_analysis.R to create a text corpus, and then uses MakeWordCloud to draw the word cloud. Note that ConstructCorpus provides a number of options, such as whether to remove hashtags (#tag) or user mentions (@user) embedded in tweets.
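For orientation, such a corpus constructor typically combines tweet-specific regex cleanup with tm's standard transformations; here is a rough sketch under the assumption of a tm-based pipeline (not the project's exact code):
EnsurePackage("tm")
# illustrative sketch only; ConstructCorpus in semantic_analysis.R may differ
SketchConstructCorpus <- function(texts, removeTags = FALSE, removeUsers = FALSE) {
    if (removeTags) texts <- gsub("#\\S+", "", texts)   # strip #hashtags
    if (removeUsers) texts <- gsub("@\\S+", "", texts)  # strip @mentions
    corpus <- Corpus(VectorSource(texts))
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
    tm_map(corpus, stripWhitespace)
}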
Next we are going to create a term-document matrix for some quick similarity computation.
# create a term-document matrix, keeping only tokens of at least three
# characters
td.mat <- TermDocumentMatrix(corpus, control = list(minWordLength = 3))
# have a quick look
td.mat
## A term-document matrix (3375 terms, 1000 documents)
##
## Non-/sparse entries: 9429/3365571
## Sparsity : 100%
## Maximal term length: 53
## Weighting : term frequency (tf)
# frequent words
findFreqTerms(td.mat, lowfreq = 10)
## [1] "2008" "2009" "50000"
## [4] "â\u0082130" "aan" "als"
## [7] "architectenweb" "banken" "beter"
## [10] "bij" "binnen" "blijft"
## [13] "bouw" "consumenten" "corporaties"
## [16] "crisis" "daling" "dan"
## [19] "dat" "deze" "die"
## [22] "dit" "door" "dus"
## [25] "economie" "een" "eens"
## [28] "effect" "eigen" "energieakkoord"
## [31] "euro" "gaan" "gaat"
## [34] "gedaald" "geen" "geld"
## [37] "gemeente" "gemiddelde" "goed"
## [40] "goede" "gratis" "haag"
## [43] "hebben" "heeft" "herstel"
## [46] "het" "hoe" "httptcoudsyâ\u0080"
## [49] "huis" "huizen" "huizencrisis"
## [52] "huizenmarkt" "huizenprijzen" "hypotheek"
## [55] "hypotheekbank" "hypotheekplan" "hypotheken"
## [58] "iets" "inflatie" "ing"
## [61] "jaar" "jan" "jaren"
## [64] "juli" "kabinet" "kan"
## [67] "kant" "komen" "komt"
## [70] "koop" "kost" "leidt"
## [73] "maand" "maar" "mee"
## [76] "meer" "met" "miljard"
## [79] "miljoen" "minder" "misschien"
## [82] "moet" "moeten" "mooi"
## [85] "naar" "nationale" "nederland"
## [88] "nederlandse" "net" "niet"
## [91] "nieuwe" "nieuws" "nog"
## [94] "ondervroeg" "onderzoek" "ons"
## [97] "ook" "opleving" "pensioenfondsen"
## [100] "plan" "prijs" "procent"
## [103] "rentedaling" "ruim" "schenken"
## [106] "sentiment" "sinds" "slecht"
## [109] "somber" "stappen" "starters"
## [112] "steeds" "tijd" "ton"
## [115] "tot" "trekt" "tussen"
## [118] "uit" "van" "vandaag"
## [121] "veel" "verkocht" "verkopen"
## [124] "vijf" "volgens" "voor"
## [127] "vraag" "waarde" "wat"
## [130] "weekoverzicht" "weer" "wel"
## [133] "wie" "wil" "wilt"
## [136] "woning" "woningen" "woningmarkt"
## [139] "woningwaarde" "wordt" "zag"
## [142] "zie" "zijn"
# find words associated with a given word
findAssocs(td.mat, "huizenmarkt", 0.5)
## numeric(0)
No terms reach the 0.5 correlation threshold with "huizenmarkt" here, which is unsurprising for a term that appears across so many otherwise unrelated tweets. For more advanced similarity computation among documents and terms, I am considering adding Latent Semantic Analysis (LSA) capability to this package in the future.
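As a preview, a minimal LSA pass with the lsa package could look like this (an assumption on my part; LSA is not part of this package yet):
EnsurePackage("lsa")
# weight the term-document matrix (log tf * idf) and build a latent space
td.mat.w <- lw_logtf(as.matrix(td.mat)) * gw_idf(as.matrix(td.mat))
lsa.space <- lsa(td.mat.w, dims = dimcalc_share())
# document vectors in the reduced space; cosine similarity of documents 1 and 10
doc.vecs <- lsa.space$dk %*% diag(lsa.space$sk)
cosine(doc.vecs[1, ], doc.vecs[10, ])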
With the sparse term-document matrix created above, we can use the TrainLDAModel function in semantic_analysis.R to train an LDA model. (Note: I don't fully understand all of the steps in TrainLDAModel, which was refactored from Ben Marwick's repo, so please help check it if you understand LDA.) This step may take a while depending on the size of the dataset.
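For orientation, the core of such a training step with the topicmodels package might look like the sketch below (assumptions on my part: Gibbs sampling and a fixed k; the actual TrainLDAModel may differ, e.g., by selecting k automatically):
EnsurePackage("topicmodels")
EnsurePackage("slam")
# rough sketch, not the project's actual TrainLDAModel
SketchTrainLDAModel <- function(td.mat, k = 20) {
    dtm <- t(td.mat)                        # LDA expects documents as rows
    dtm <- dtm[slam::row_sums(dtm) > 0, ]   # drop documents with no tokens left
    LDA(dtm, k = k, method = "Gibbs", control = list(seed = 2013, iter = 2000))
}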
# timing start
ptm <- proc.time()
# generate a LDA model
lda <- TrainLDAModel(td.mat)
# time used
proc.time() - ptm
## user system elapsed
## 587.033 0.192 589.224
This LDA model contains 20 topics. We can check the keywords in each topic, get the most relevant topics of each tweet, and compute similarity scores among tweets based on the topics they are related to.
# get keywords for each topic
lda_terms <- get_terms(lda, 5)
# look at the first 5 topics
lda_terms[, 1:5]
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "consumenten" "een" "nederlandse" "aan" "gaat"
## [2,] "beter" "die" "hebben" "woning" "naar"
## [3,] "somber" "procent" "herstel" "miljard" "komt"
## [4,] "50000" "het" "sinds" "woningwaarde" "goede"
## [5,] "ondervroeg" "huizenmarkt" "moet" "â\u0082130" "hypotheken"
# get topic numbers per document
lda_topics <- get_topics(lda, 5)
# look at the first 10 documents
lda_topics[, 1:10]
## 1 2 3 4 5 6 7 8 9 10
## [1,] 3 19 4 5 13 19 19 19 3 3
## [2,] 13 1 19 9 9 15 1 15 9 9
## [3,] 4 2 1 12 19 10 2 1 16 16
## [4,] 16 3 2 19 1 7 3 2 1 1
## [5,] 17 4 3 1 2 1 4 3 2 2
# compute similarity between two documents
CosineSimilarity(lda_topics[, 1], lda_topics[, 10])
## [,1]
## [1,] 0.4712
# compute a similarity matrix of documents
sim.mat <- sapply(1:ncol(lda_topics), function(i) {
sapply(1:ncol(lda_topics), function(j) CosineSimilarity(lda_topics[, i],
lda_topics[, j]))
})
# find most relevant tweets for a tweet
index <- 1
ids <- which(sim.mat[, index] > quantile(sim.mat[, index], 0.9))
sim.doc.df <- data.frame(id = ids, sim = sim.mat[, index][ids])
sim.doc.df <- sim.doc.df[with(sim.doc.df, order(-sim)), ]
# indices of most relevant tweets
head(sim.doc.df$id)
## [1] 1 12 920 146 687 269
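CosineSimilarity is a small helper in this project; its essence is the standard formula, roughly as follows (a sketch assuming plain numeric vectors):
# illustrative sketch; note that crossprod() returns a 1x1 matrix, consistent
# with how the result prints above
SketchCosineSimilarity <- function(a, b) {
    crossprod(a, b)/sqrt(crossprod(a) * crossprod(b))
}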
This project implements three methods of analyzing the sentiment of tweets (one of which depends on ViralHeat and is not working). Let's try the ScoreSentiment function in sentiment_analysis.R, which follows a simple lexicon-based approach; a sketch of the idea follows.
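The gist of a lexicon-based scorer is counting positive minus negative word matches, as in the sketch below (pos.words and neg.words are hypothetical stand-ins for an opinion lexicon; the project's ScoreSentiment may differ):
# illustrative lexicon-based scorer for a single tweet
SketchScoreSentiment <- function(text, pos.words, neg.words) {
    words <- strsplit(tolower(gsub("[[:punct:]]", "", text)), "\\s+")[[1]]
    sum(words %in% pos.words) - sum(words %in% neg.words)
}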
# compute sentiment scores for all tweets
scores <- ScoreSentiment(df$text, .progress = "text")
# plot scores
ggplot(scores, aes(x = score)) + geom_histogram(binwidth = 1) + xlab("Sentiment score") +
ylab("Frequency") + ggtitle("Sentiment Analysis of Tweets")
scores <- scores[with(scores, order(-score)), ]
# check happy tweets
as.character(head(scores$text, 3))
# check unhappy tweets
as.character(tail(scores$text, 3))
# check sentiment scores of tweets containing a certain token: create a
# subset of tweets containing the token, e.g., 'huizenmarkt'
scores.sub <- subset(scores, regexpr("huizenmarkt", scores$text) > 0)
# plot histogram for this token
ggplot(scores.sub, aes(x = score)) + geom_histogram(binwidth = 1) + xlab("Sentiment score for the token 'huizenmarkt'") +
ylab("Frequency")
Since most tweets in this dataset are written in a language other than English, the sentiment scores here (computed against an English dictionary) are not really useful.