Read Tokens into R
library(twitteR)
# replace the placeholders with your own app credentials (never publish real keys)
consumer_key <- 'YOUR_CONSUMER_KEY'
consumer_secret <- 'YOUR_CONSUMER_SECRET'
access_token <- 'YOUR_ACCESS_TOKEN'
access_secret <- 'YOUR_ACCESS_SECRET'
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
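Hardcoding live keys in a shared script is risky; as a sketch of a safer pattern (the TWITTER_* variable names here are just placeholders), the keys can be kept in environment variables and read at run time:
# read credentials from environment variables instead of embedding them
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)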
Read data from Twitter into R
num_tweets <- 100
r_stats <- searchTwitter("#HU", n=num_tweets)
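For a quick sanity check, the list of status objects can be flattened into a data frame with twListToDF (r_stats_df is just an illustrative name):
r_stats_df <- twListToDF(r_stats) # one row per tweet
head(r_stats_df$text)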
Clean the tweet text
library(tm)
library(wordcloud)
# save the text of each tweet
r_stats_text <- sapply(r_stats, function(x) x$getText())
# create a corpus - constructs a text document collection
r_stats_text_corpus <- Corpus(VectorSource(r_stats_text))
# recent versions of tm require custom functions to be wrapped in content_transformer()
r_stats_text_corpus <- tm_map(r_stats_text_corpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
r_stats_text_corpus <- tm_map(r_stats_text_corpus, content_transformer(tolower))
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removePunctuation)
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removeWords, stopwords())
wordcloud(r_stats_text_corpus)
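The default cloud is often noisy; a sketch with a few common wordcloud options (assumes the RColorBrewer package is installed for the palette):
library(RColorBrewer)
# cap the cloud at the 50 most frequent terms and color by frequency band
wordcloud(r_stats_text_corpus, min.freq = 2, max.words = 50,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))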
myDtm <- TermDocumentMatrix(r_stats_text_corpus, control = list(wordLengths = c(1, Inf)))
findFreqTerms(myDtm, lowfreq = 10) # terms that appear at least 10 times
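Related terms can be explored with tm's findAssocs; a sketch (the term "data" is only a placeholder, substitute any term returned above):
findAssocs(myDtm, "data", corlimit = 0.3) # terms correlated with "data" at r >= 0.3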
Look up user ID information
#r_stats <- userTimeline("Trump", n=100) # n is the maximum number of tweets to return
me <- getUser("realdonaldtrump")
me$getId() #25073877
getUser(25073877)
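With the user object in hand, recent tweets can be pulled the same way; a quick sketch (me_tweets is an illustrative name):
me_tweets <- userTimeline(me, n = 50) # most recent tweets, capped by the API
me_tweets_df <- twListToDF(me_tweets)
head(me_tweets_df$text)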
Follower information
# look up full user objects for the follower IDs (slow for accounts with many followers)
followersIds <- lookupUsers(me$getFollowerIDs()) # despite the name, these are user objects
length(followersIds)
followers <- me$getFollowers() # returns a list of user objects
#followersIds[[1]]
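Each element of the list is a twitteR user object, so individual fields come out through accessor methods, e.g.:
followers[[1]]$getScreenName() # screen name of one follower
followers[[1]]$getFollowersCount() # that follower's own follower count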
Extra information from Twitter
trend <- availableTrendLocations()
head(trend)
trends <- getTrends(1) # WOEID 1 = worldwide
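getTrends() returns a data frame, so the usual inspection tools apply; a quick look (assuming the name and query columns that twitteR returns):
head(trends$name) # the trending topic names
head(trends$query) # the corresponding search queries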
Example: analyzing HU tweets
user <- getUser("HarrisburgU")
friends <- user$getFriends() # who HU follows
friends_df <- twListToDF(friends)
save(friends_df, file = "hu_friends.RData")
followers <- user$getFollowers() # HU followers
followers_df <- twListToDF(followers)
save(followers_df, file = "hu_followers.RData")
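As a quick cross-check of the two saved data frames, a sketch that finds the accounts HU follows that also follow back (mutuals is just an illustrative name):
mutuals <- intersect(friends_df$screenName, followers_df$screenName)
length(mutuals) # number of mutual connections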
Some analysis of the Twitter data
Use the information stored in friends_df to answer these questions…
load("hu_friends.RData")
# 1.
friends_df %>% pull(lang) %>% as.factor() %>% summary()
## en
## 147
# all English
# 2.
friends_df %>%
  ggplot(aes(friendsCount)) +
  geom_density()
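Counts like this are usually heavily right-skewed, so a log-scaled version (a sketch; accounts with zero counts drop out with a warning) often shows the shape more clearly:
friends_df %>%
  ggplot(aes(friendsCount)) +
  geom_density() +
  scale_x_log10()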
# 3.
# how active? - check the distribution of the number of their tweets
friends_df %>%
  ggplot(aes(x = statusesCount)) +
  geom_density()
# it looks like most accounts are not super active, but this may reflect
# the actual situation on social media
# 4.
friends_df %>%
  arrange(followersCount) %>%
  slice(n()) %>% # last row after the ascending sort = the maximum
  select(followersCount, name)
# the biggest network: NASA
friends_df %>%
  arrange(statusesCount) %>%
  slice(n()) %>%
  select(statusesCount, name)
# the one that tweets the most: WGAL
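Instead of sorting and slicing the last row, dplyr (>= 1.0) offers slice_max; a sketch listing the top five accounts by follower count:
friends_df %>%
  slice_max(followersCount, n = 5) %>%
  select(name, followersCount)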
# 5.
# plot to see whether the two counts look correlated
friends_df %>%
  ggplot(aes(x = followersCount, y = statusesCount)) +
  geom_point()
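Both counts are skewed, so a log-log view (a sketch; zero counts drop out with a warning) can make any relationship easier to see:
friends_df %>%
  ggplot(aes(x = followersCount, y = statusesCount)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()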
cor.test(x = friends_df$followersCount, y = friends_df$statusesCount)
##
## Pearson's product-moment correlation
##
## data: friends_df$followersCount and friends_df$statusesCount
## t = 1.1467, df = 145, p-value = 0.2534
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06813745 0.25281474
## sample estimates:
## cor
## 0.09480159
# cannot reject the null hypothesis that the correlation is 0
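Given that skew, a rank-based statistic is a sensible robustness check; a sketch using Spearman's rho (ties may trigger an approximate p-value warning):
cor.test(x = friends_df$followersCount, y = friends_df$statusesCount,
         method = "spearman")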