Social media has become a key communication platform for professional skills development, not just a place to share selfies and rants. For our final project, we analyze the R Project for Statistical Computing (R) community on Twitter to uncover how influence is held and shared. The project also showcases a variety of skills developed through IS607, including the use of REST APIs, the R programming language, and the Neo4j graph database (a sketch of the Neo4j step follows the tweet collection below).

library("twitteR")
library("wordcloud")
library("tm")
library("sentiment")
library("ggplot2")
library("Rgraphviz")

# Read Twitter API credentials from a local file kept out of version control
key <- scan("C:/Users/Public/api_key.txt", what="character")

consumer_key    <- key[1]
consumer_secret <- key[2]
access_token    <- key[3]
access_secret   <- key[4]
setup_twitter_oauth(consumer_key,
                    consumer_secret,
                    access_token,
                    access_secret)
## [1] "Using direct authentication"

Rstats Tweets

tweets <- searchTwitter("#rstats", n=1500, lang="en")
# transform tweets to data frame
df <- twListToDF(tweets)
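
Before cleaning, it helps to glance at the fields twListToDF returns; the columns below are the ones most relevant to an influence analysis (output omitted here, as it varies by run):

# Peek at the fields used later in the analysis
str(df[, c("text", "screenName", "created", "retweetCount")])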

Pydata Tweets

tweets_py <- searchTwitter("#pydata", n=1500, lang="en")
# transform tweets to data frame
df2 <- twListToDF(tweets_py)
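
With both tweet sets collected, the mention network can be pushed into Neo4j for the influence analysis described above. This is a minimal sketch assuming a local Neo4j server and the RNeo4j client package; the User label and MENTIONS relationship type are our own naming, not part of any standard schema:

library(RNeo4j)  # assumed Neo4j client package
graph <- startGraph("http://localhost:7474/db/data/",
                    username = "neo4j", password = "password")

# Pull @mentions out of each tweet and MERGE author -> mentioned-user edges
mentions <- regmatches(df$text, gregexpr("@\\w+", df$text))
query <- "MERGE (a:User {name: {from}})
          MERGE (b:User {name: {to}})
          MERGE (a)-[:MENTIONS]->(b)
          RETURN count(*)"
for (i in seq_along(mentions)) {
  for (m in mentions[[i]]) {
    cypher(graph, query, from = df$screenName[i], to = sub("@", "", m))
  }
}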

Credit for the word cloud and clean-up code goes to the following tutorial: http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know#load-the-text

R Community Tweets

Clean dataset

# Use the tm (text mining) package
mycorpus <- Corpus(VectorSource(df$text))

# Text transformation: replace separators and mention markers with spaces
fix <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
mycorpus <- tm_map(mycorpus, fix, "/")
mycorpus <- tm_map(mycorpus, fix, "@")
mycorpus <- tm_map(mycorpus, fix, "\\|")

# Convert the text to lower case first, so the stopword removal below
# also catches capitalized forms
mycorpus <- tm_map(mycorpus, content_transformer(tolower))
# Remove numbers
mycorpus <- tm_map(mycorpus, removeNumbers)
# Remove common English stopwords
mycorpus <- tm_map(mycorpus, removeWords, stopwords("english"))
# Remove punctuation
mycorpus <- tm_map(mycorpus, removePunctuation)
# Eliminate extra white space
mycorpus <- tm_map(mycorpus, stripWhitespace)
# Remove our own stopwords, specified as a character vector
# ("tco" and the random string are left-over short-URL fragments)
mycorpus <- tm_map(mycorpus, removeWords,
                   c("http", "https", "tco", "amp", "just", "atuqiqdpj", "rstats"))
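
It is worth eyeballing a couple of cleaned documents before building the term-document matrix (output omitted; it varies by run):

# Inspect the first two cleaned tweets
inspect(mycorpus[1:2])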

Build term-document matrix

dtm <- TermDocumentMatrix(mycorpus)
a <- as.matrix(dtm)
b <- sort(rowSums(a), decreasing=TRUE)     # total frequency of each term
c <- data.frame(word = names(b), freq = b) # term/frequency table
head(c, 10)
##                    word freq
## datascience datascience  282
## rstats           rstats  260
## data               data  247
## new                 new  222
## package         package  208
## bigdata         bigdata  176
## kirkdborne   kirkdborne  153
## analytics     analytics  143
## python           python  125
## rbloggers     rbloggers  114

Cluster Dendrogram

# Remove sparse terms: drop terms absent from more than 95% of documents
dtm2 <- removeSparseTerms(dtm, sparse = 0.95)
m2 <- as.matrix(dtm2)
# Cluster terms by Euclidean distance on the scaled matrix
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward.D2")
plot(fit)
rect.hclust(fit, k=6) # cut tree into 6 clusters

Word Cloud

set.seed(1234)
wordcloud(words = c$word, freq = c$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Find word association

findAssocs(dtm, terms = "statistics", corlimit = 0.4)
## $statistics
## numeric(0)

No term in this sample correlates with "statistics" at or above 0.4, so findAssocs returns an empty vector.
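
As an illustrative follow-up (not run against the original sample; results depend on the tweets retrieved), lowering the threshold or probing a more frequent term usually does surface associations:

# Hypothetical probes; the terms and thresholds are chosen for illustration
findAssocs(dtm, terms = "statistics", corlimit = 0.2)
findAssocs(dtm, terms = "data", corlimit = 0.25)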

Word frequency bar chart

barplot(c[1:10,]$freq, las = 2, names.arg = c[1:10,]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")

Word association graph

(freq.terms <- findFreqTerms(dtm, lowfreq = 45))
##  [1] "analysis"       "analyticbridge" "analytics"      "announcement"  
##  [5] "bigdata"        "cheat"          "check"          "code"          
##  [9] "data"           "datascience"    "dataviz"        "eddelbuettel"  
## [13] "gaborcsardi"    "hadleywickham"  "hard"           "how"           
## [17] "incredibly"     "jawsdkfopi"     "jzebmnqiyc"     "kirkdborne"    
## [21] "libraries"      "list"           "many"           "new"           
## [25] "now"            "open"           "package"        "packages"      
## [29] "pandas"         "programming"    "projects"       "python"        
## [33] "rbloggers"      "rocks"          "rstats"         "rstudio"       
## [37] "sample"         "sas"            "science"        "sgobxbyn"      
## [41] "sheets"         "sl<U+0085>"            "statistics"     "the"           
## [45] "tutorials"      "ucdslonz"       "use"            "useful"        
## [49] "using"          "via"            "way"            "website"
# Requires Rgraphviz; plot terms whose pairwise correlation exceeds 0.12
plot(dtm, term = freq.terms, corThreshold = 0.12, weighting = FALSE,
     attrs = list(node = list(width = 20, fontsize = 24,
                              fontcolor = "blue", color = "red")))

Polarity distribution

# Classify polarity of the raw tweet text (not the frequency table)
class_pol <- classify_polarity(df$text, algorithm="bayes")

# Get polarity best fit (column 4 is BEST_FIT)
polarity <- class_pol[,4]

polarity_df <- data.frame(text=df$text, polarity=polarity, stringsAsFactors=FALSE)
qplot(polarity, data=polarity_df)

Emotion distribution

# Classify emotion of the raw tweet text
class_emo <- classify_emotion(df$text, algorithm="bayes", prior=1.0)

# Get emotion best fit (column 7 is BEST_FIT)
emotion <- class_emo[,7]

# Substitute NAs with "unknown"
emotion[is.na(emotion)] <- "unknown"

# Data frame with results
emo_df <- data.frame(text=df$text, emotion=emotion, stringsAsFactors=FALSE)

# Order emotion levels by frequency for plotting
emo_df <- within(emo_df, emotion <- factor(emotion, levels=names(sort(table(emotion), decreasing=TRUE))))

# Plot distribution of emotions
ggplot(emo_df, aes(x=emotion)) +
  geom_bar(aes(fill=emotion)) +
  scale_fill_brewer(palette="Dark2") +
  labs(x="emotion categories", y="number of tweets") +
  geom_text(stat="count", aes(label=..count..), vjust=-0.3)

Python Community Tweets

Clean dataset

# Use the tm (text mining) package
mycorpusp <- Corpus(VectorSource(df2$text))

# Text transformation: replace separators and mention markers with spaces
fix <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
mycorpusp <- tm_map(mycorpusp, fix, "/")
mycorpusp <- tm_map(mycorpusp, fix, "@")
mycorpusp <- tm_map(mycorpusp, fix, "\\|")

# Convert the text to lower case
mycorpusp <- tm_map(mycorpusp, content_transformer(tolower))
# Remove numbers
mycorpusp <- tm_map(mycorpusp, removeNumbers)
# Remove common English stopwords
mycorpusp <- tm_map(mycorpusp, removeWords, stopwords("english"))
# Remove punctuation
mycorpusp <- tm_map(mycorpusp, removePunctuation)
# Eliminate extra white space
mycorpusp <- tm_map(mycorpusp, stripWhitespace)
# Remove our own stopwords, specified as a character vector
mycorpusp <- tm_map(mycorpusp, removeWords,
                    c("http", "https", "tco", "amp", "just", "pydata"))

Build term-document matrix

dtmp <- TermDocumentMatrix(mycorpusp)
ap <- as.matrix(dtmp)
bp <- sort(rowSums(ap), decreasing=TRUE)      # total frequency of each term
cp <- data.frame(word = names(bp), freq = bp) # term/frequency table
head(cp, 10)
##                      word freq
## rstats             rstats   20
## python             python   16
## spark               spark    9
## talks               talks    9
## berlin             berlin    8
## pydataberlin pydataberlin    8
## week                 week    8
## bokeh               bokeh    7
## data                 data    7
## datascience   datascience    7

Cluster Dendrogram

# Remove sparse terms: drop terms absent from more than 95% of documents
dtm2p <- removeSparseTerms(dtmp, sparse = 0.95)
m2p <- as.matrix(dtm2p)
# Cluster terms by Euclidean distance on the scaled matrix
distMatrixp <- dist(scale(m2p))
fitp <- hclust(distMatrixp, method = "ward.D2") # separate object keeps the R-community fit intact
plot(fitp)
rect.hclust(fitp, k=8) # cut tree into 8 clusters

Word Cloud

set.seed(1234)
wordcloud(words = cp$word, freq = cp$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Word frequency bar chart

barplot(cp[1:10,]$freq, las = 2, names.arg = cp[1:10,]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")

Word association graph

(freq.terms <- findFreqTerms(dtmp, lowfreq = 8))
## [1] "berlin"       "pydataberlin" "python"       "rstats"      
## [5] "spark"        "talks"        "week"
# Requires Rgraphviz; plot terms whose pairwise correlation exceeds 0.12
plot(dtmp, term = freq.terms, corThreshold = 0.12, weighting = FALSE,
     attrs = list(node = list(width = 20, fontsize = 24,
                              fontcolor = "blue", color = "red")))

Polarity distribution

# Classify polarity of the raw tweet text (not the frequency table)
class_polp <- classify_polarity(df2$text, algorithm="bayes")

# Get polarity best fit (column 4 is BEST_FIT)
polarityp <- class_polp[,4]

polarity_dfp <- data.frame(text=df2$text, polarity=polarityp, stringsAsFactors=FALSE)
qplot(polarity, data=polarity_dfp)

Emotion distribution

# Classify emotion of the raw tweet text
class_emop <- classify_emotion(df2$text, algorithm="bayes", prior=1.0)

# Get emotion best fit (column 7 is BEST_FIT)
emotionp <- class_emop[,7]

# Substitute NAs with "unknown"
emotionp[is.na(emotionp)] <- "unknown"

# Data frame with results
emo_dfp <- data.frame(text=df2$text, emotion=emotionp, stringsAsFactors=FALSE)

# Order emotion levels by frequency for plotting
emo_dfp <- within(emo_dfp, emotion <- factor(emotion, levels=names(sort(table(emotion), decreasing=TRUE))))

# Plot distribution of emotions
ggplot(emo_dfp, aes(x=emotion)) +
  geom_bar(aes(fill=emotion)) +
  scale_fill_brewer(palette="Dark2") +
  labs(x="emotion categories (incl. unknown)", y="number of tweets") +
  geom_text(stat="count", aes(label=..count..), vjust=-0.3)