S. Hong, H. Berk, J. Hamski
May 27, 2015
Twitter is a venue for professional skills development…
…not just a place to post cat pics and complain to airlines!
So why not use Twitter to do a qualitative and quantitative analysis of the R and Python communities?
# ---- #rstats tweets ----
# Request up to 1500 English-language tweets matching the #rstats hashtag.
# NOTE(review): presumably a twitteR OAuth session was set up earlier in the
# script -- confirm, it is not visible in this section.
tweets <- searchTwitter(searchString = "#rstats", n = 1500, lang = "en")
# Convert the returned list of tweets into a data frame.
# NOTE: `df` is also the name of a function in the stats package; the variable
# masks it by name only (calls such as df(x, ...) still find the function).
df <- twListToDF(tweets)
# ---- #pydata tweets ----
# Request up to 1500 English-language tweets matching the #pydata hashtag.
# NOTE(review): presumably a twitteR OAuth session was set up earlier in the
# script -- confirm, it is not visible in this section.
tweets_py <- searchTwitter(searchString = "#pydata", n = 1500, lang = "en")
# Convert the returned list of tweets into a data frame.
df2 <- twListToDF(tweets_py)
# ---- #rstats corpus: build and clean the tweet text ----
# Use the tm (text mining) package; each tweet's text becomes one document.
mycorpus <- Corpus(VectorSource(df$text))
# Text transformation helper: replace every match of `pattern` with a space.
fix <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
mycorpus <- tm_map(mycorpus, fix, "/")
mycorpus <- tm_map(mycorpus, fix, "@")
mycorpus <- tm_map(mycorpus, fix, "\\|")
# Convert the text to lower case BEFORE any stop-word removal.
# removeWords() matches case-sensitively and both stop-word lists below are
# lowercase, so lowercasing afterwards would let capitalised variants
# ("The", "RStats", "HTTPS", ...) slip through and then be counted as the very
# terms we meant to drop. This order matches the #pydata pipeline.
mycorpus <- tm_map(mycorpus, content_transformer(tolower))
# Remove numbers
mycorpus <- tm_map(mycorpus, removeNumbers)
# Remove common English stop words (done before punctuation removal so that
# contractions in the stop-word list still match).
mycorpus <- tm_map(mycorpus, removeWords, stopwords("english"))
# Remove punctuation
mycorpus <- tm_map(mycorpus, removePunctuation)
# Collapse runs of white space
mycorpus <- tm_map(mycorpus, stripWhitespace)
# Remove project-specific stop words (URL fragments, HTML-entity residue and
# the search hashtag itself), given as a character vector.
mycorpus <- tm_map(
  mycorpus, removeWords,
  c("http", "https", "tco", "amp", "just", "atuqiqdpj", "rstats")
)
# ---- #pydata corpus: build and clean the tweet text ----
# Wrap the tweet text in a tm corpus; each tweet becomes one document.
mycorpusp <- Corpus(VectorSource(df2$text))

# Helper transformation: substitute a single space for every match of
# `pattern` inside a document's content.
fix <- content_transformer(
  function(x, pattern) {
    gsub(pattern, " ", x)
  }
)

# Blank out characters that would otherwise glue tokens together.
mycorpusp <- tm_map(mycorpusp, fix, "/")
mycorpusp <- tm_map(mycorpusp, fix, "@")
mycorpusp <- tm_map(mycorpusp, fix, "\\|")

# Lower-case everything first so the lowercase stop-word lists below match.
mycorpusp <- tm_map(mycorpusp, content_transformer(tolower))

# Drop digits.
mycorpusp <- tm_map(mycorpusp, removeNumbers)

# Drop the standard English stop words.
mycorpusp <- tm_map(mycorpusp, removeWords, stopwords("english"))

# Drop punctuation characters.
mycorpusp <- tm_map(mycorpusp, removePunctuation)

# Collapse repeated white space into single spaces.
mycorpusp <- tm_map(mycorpusp, stripWhitespace)

# Drop the project-specific stop words (URL fragments, HTML-entity residue and
# the search hashtag itself), supplied as a character vector.
mycorpusp <- tm_map(
  mycorpusp,
  removeWords,
  c("http", "https", "tco", "amp", "just", "pydata")
)
# ---- #rstats: most frequent terms ----
# Build a term-document matrix from the cleaned corpus. Despite the name
# `dtm`, terms are the rows and documents the columns.
dtm <- TermDocumentMatrix(mycorpus)
# Densify so that base rowSums() can be applied.
a <- as.matrix(dtm)
# Total occurrences of each term across all tweets, largest first.
b <- sort(rowSums(a), decreasing = TRUE)
# Two-column frequency table (term, count).
# NOTE: the variable `c` shares its name with base::c(); calls like c(1, 2)
# still resolve to the function, but the name is easy to misread.
c <- data.frame(word = names(b), freq = b)
# Show the ten most frequent terms.
head(c, n = 10)
# ---- #pydata: most frequent terms ----
# Build a term-document matrix from the cleaned corpus. Despite the name
# `dtmp`, terms are the rows and documents the columns.
dtmp <- TermDocumentMatrix(mycorpusp)
# Densify so that base rowSums() can be applied.
ap <- as.matrix(dtmp)
# Total occurrences of each term across all tweets, largest first.
bp <- sort(rowSums(ap), decreasing = TRUE)
# Two-column frequency table (term, count).
cp <- data.frame(word = names(bp), freq = bp)
# Show the ten most frequent terms.
head(cp, n = 10)