wordcloud for workout tweets

## [1] "Using direct authentication"

tw = twitteR::searchTwitter('workout', n = 1000, since = '2016-11-08', retryOnRateLimit = 1)
workout = twitteR::twListToDF(tw)
library('tm')

## Warning: package 'tm' was built under R version 3.4.4

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')
saveRDS(workout, "workout.RDS")
workout <- readRDS("workout.RDS")
workouttweet <- workout$text
clean.text = function(x)
{
  x = gsub("[^[:graph:]]", " ",x)
  x = tolower(x)
  x = gsub("rt", "", x)
  x = gsub("@\\w+", "", x)
  x = gsub("[[:punct:]]", "", x)
  x = gsub("[[:digit:]]", "", x)
  x = gsub("http\\w+", "", x)
  x = gsub("[ |\t]{2,}", "", x)
  x = gsub("^ ", "", x)
  x = gsub(" $", "", x)
  return(x)
}
workouttweet = clean.text(workouttweet)
corpus = Corpus(VectorSource(workouttweet))
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, tolower = TRUE) )
tdm = as.matrix(tdm)
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 
word_freqs = word_freqs[-(1:9)]
dm = data.frame(word=names(word_freqs), freq=word_freqs)
wordcloud(head(dm$word, 50), head(dm$freq, 50), scale=c(2, .9), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

head(word_freqs, 20)

##          need        office           car         comfo   outdoorsget 
##            53            46            45            45            45 
##     poolready summerworkout          like          will           can 
##            45            45            40            39            38 
##        korang         still         stuff   amalkanbuat         badan 
##            32            32            31            29            29 
##         berat         dalam        inilah           jer         kalau 
##            29            29            29            29            29

sentimental anaylse for workout tweets

pos.words = scan('positive-words.txt', what='character', comment.char=';')
neg.words = scan('negative-words.txt', what='character', comment.char=';')
require(plyr)

## Loading required package: plyr

## Warning: package 'plyr' was built under R version 3.4.4

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:twitteR':
## 
##     id

require(stringr)

## Loading required package: stringr

## Warning: package 'stringr' was built under R version 3.4.4

require(stringi)

## Loading required package: stringi

## Warning: package 'stringi' was built under R version 3.4.4

neg.words = c(neg.words, 'wtf', 'fail')
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
  

  scores = laply(sentences, function(sentence, pos.words, neg.words) {
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    sentence = tolower(sentence)
    

    word.list = str_split(sentence, '\\s+')

    words = unlist(word.list)
    

    pos.matches = match(words, pos.words)
    neg.matches = match(words, neg.words)
    

    pos.matches = !is.na(pos.matches)
    neg.matches = !is.na(neg.matches)
    

    score = sum(pos.matches) - sum(neg.matches)
    
    return(score)
  }, pos.words, neg.words, .progress=.progress )
  
  scores.df = data.frame(score=scores, text=sentences)
  return(scores.df)
}
sentiment.scores= score.sentiment(workouttweet, pos.words, neg.words, .progress='none')

score <- sentiment.scores$score
library(plotly)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

p <- plot_ly(x = ~score, type = "histogram")
p

## Warning: package 'bindrcpp' was built under R version 3.4.4

topic.pos = subset(sentiment.scores, score > 0)
topic.neg = subset(sentiment.scores, score < 0)
topic.neu = subset(sentiment.scores, score == 0)
negN = nrow(topic.neg)
posN = nrow(topic.pos)
neuN = nrow(topic.neu)
dftemp=data.frame(topic=c("Postive tweets", "Negative tweets" , "Neutral tweets"), 
                  number=c(posN, negN, neuN))
library(plotly)
p <- plot_ly(data=dftemp, labels = ~topic, values = ~number, type = 'pie') %>%
  layout(title = 'Pie Chart of  the percentage of Positive, Negative and Neutral tweets',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p

#Network anaysis for workout tweets

library(mclust)

## Warning: package 'mclust' was built under R version 3.4.4

## Package 'mclust' version 5.4
## Type 'citation("mclust")' for citing this R package in publications.

library(RColorBrewer)       
library(sna)

## Warning: package 'sna' was built under R version 3.4.4

## Loading required package: statnet.common

## 
## Attaching package: 'statnet.common'

## The following object is masked from 'package:base':
## 
##     order

## Loading required package: network

## Warning: package 'network' was built under R version 3.4.4

## network: Classes for Relational Data
## Version 1.13.0.1 created on 2015-08-31.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
##                     Mark S. Handcock, University of California -- Los Angeles
##                     David R. Hunter, Penn State University
##                     Martina Morris, University of Washington
##                     Skye Bender-deMoll, University of Washington
##  For citation information, type citation("network").
##  Type help("network-package") to get started.

## 
## Attaching package: 'network'

## The following object is masked from 'package:plyr':
## 
##     is.discrete

## sna: Tools for Social Network Analysis
## Version 2.4 created on 2016-07-23.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
##  For citation information, type citation("sna").
##  Type help(package="sna") to get started.

library(graph)

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following object is masked from 'package:statnet.common':
## 
##     order

## The following object is masked from 'package:twitteR':
## 
##     as.data.frame

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min

## 
## Attaching package: 'graph'

## The following object is masked from 'package:sna':
## 
##     degree

## The following object is masked from 'package:stringr':
## 
##     boundary

## The following object is masked from 'package:plyr':
## 
##     join

library(igraph)

## Warning: package 'igraph' was built under R version 3.4.4

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:graph':
## 
##     degree, edges, intersection, union

## The following objects are masked from 'package:BiocGenerics':
## 
##     normalize, union

## The following objects are masked from 'package:sna':
## 
##     betweenness, bonpow, closeness, components, degree,
##     dyad.census, evcent, hierarchy, is.connected, neighborhood,
##     triad.census

## The following objects are masked from 'package:network':
## 
##     %c%, %s%, add.edges, add.vertices, delete.edges,
##     delete.vertices, get.edge.attribute, get.edges,
##     get.vertex.attribute, is.bipartite, is.directed,
##     list.edge.attributes, list.vertex.attributes,
##     set.edge.attribute, set.vertex.attribute

## The following object is masked from 'package:plotly':
## 
##     groups

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(readr)

## Warning: package 'readr' was built under R version 3.4.4

library(plyr)
library(stringr)
library(stringi)
library(magrittr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:igraph':
## 
##     as_data_frame, groups, union

## The following object is masked from 'package:graph':
## 
##     union

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:twitteR':
## 
##     id, location

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

tData <- data.frame(1:length(workout$screenName),workout$screenName,workout$text)
tData$workout.screenName <- as.character(tData$workout.screenName)
tData$workout.text <- as.character(tData$workout.text)
createList <- function(tData) {
  
  # Reads data
  nData <- 
    tData %>% 
   set_colnames(c("id", "screenname", "tweet")) %>%
      tbl_df()
  
  # Extracts poster information
  retweeterPoster <- 
    nData %>%
    mutate(is_retweeted = stri_detect_regex(tweet, "(RT|via)((?:\\b\\W*@\\w+)+)")) %>%
    filter(is_retweeted) %>%
    rowwise() %>%
    do({
      # Gets retwitter
      who_retweet <- 
        stri_extract_first_regex(.$tweet, "(RT|via)((?:\\b\\W*@\\w+)+)")[[1]] %>%
        stri_extract_first_regex("@[a-zA-Z0-9_]{1,}") %>%
        stri_replace_all_fixed("@", "")
      
      # Returns pair
      data_frame(who_post = .$screenname, who_retweet = who_retweet, 
                 combi = stri_c(sort(c(.$screenname, who_retweet)), collapse = " "))
    }) %>%
    ungroup() %>%
    group_by(combi) %>%
    summarize(from = min(who_post, who_retweet), 
              to = max(who_post, who_retweet), 
              weight = n()) %>%
    ungroup() %>%
    select(-combi)
  
  # Returns results
  retweeterPoster
}

retweeterPoster <- createList(tData)
m <- ftM2adjM(ft = as.matrix(retweeterPoster[, 1:2]), W = retweeterPoster$weight, edgemode = "directed")
g1 <- as(m, "graphNEL")
node <- data.frame(nodes(g1))
node$betweenness <-  sna::betweenness(m)
node$degree <- sna::degree(m)
sortlist <- node[order(-node$degree),]
head(sortlist, 10)

##           nodes.g1. betweenness degree
## 217    InfoBoosting         345     45
## 346      twtjogging          58     31
## 317     RockiiRoadd          48     19
## 342    ThinkSarcasm          12     13
## 341     therealcoya          10     11
## 39   alyssabflowers           8      9
## 180 FitnessMagazine          18      9
## 279  muscle_fitness           3      9
## 154     drdishbball           0      8
## 145  DeluxeFitgirls           0      6

node %<>%
  mutate(size = log(node$degree)) %>%
  mutate(size = ifelse(size == -Inf, 1, size))
N = 3
node %<>%
  mutate(group = Mclust(size, G = N)$classification)
library(visNetwork)

## Warning: package 'visNetwork' was built under R version 3.4.4

gnode <- data.frame(node$nodes.g1.)
gnode<- setNames(gnode, "id")
gnode$shape <- "dot"  
gnode$shadow <- TRUE # Nodes will drop shadow
gnode$title <- node$nodes.g1. #Click to show title
gnode$label <- node$degree # Node label by degree centrality
gnode$group <- node$group
gnode$size <- gnode$group*3 # Node size by group
gnode$color.background <- c("slategrey", "tomato", "gold")[gnode$group]

visNetwork(
  gnode,
  setNames(retweeterPoster, c("from", "to", "weight"))
) %>%
 visOptions(highlightNearest = TRUE, 
             selectedBy = "group")

Untitled

Xuyang Wu

May 13, 2018

wordcloud for workout tweets

sentimental anaylse for workout tweets