Read Tokens into R
library(twitteR)
# replace the placeholders with your own app credentials (never publish real keys)
consumer_key <- 'YOUR_CONSUMER_KEY'
consumer_secret <- 'YOUR_CONSUMER_SECRET'
access_token <- 'YOUR_ACCESS_TOKEN'
access_secret <- 'YOUR_ACCESS_SECRET'
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
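Hardcoding live keys in a shared script is risky; as a sketch of a safer pattern (the TWITTER_* variable names here are just placeholders), the keys can be kept in environment variables and read at run time:
# read credentials from environment variables instead of embedding them
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)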
Read data from Twitter into R
num_tweets <- 100
r_stats <- searchTwitter("#HU", n=num_tweets)
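For a quick sanity check, the list of status objects can be flattened into a data frame with twListToDF (r_stats_df is just an illustrative name):
r_stats_df <- twListToDF(r_stats) # one row per tweet
head(r_stats_df$text)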
Clean the tweet text
library(tm)
library(wordcloud)
# save the text of each tweet
r_stats_text <- sapply(r_stats, function(x) x$getText())
# create a corpus - constructs a text document collection
r_stats_text_corpus <- Corpus(VectorSource(r_stats_text))
# recent versions of tm require custom functions to be wrapped in content_transformer()
r_stats_text_corpus <- tm_map(r_stats_text_corpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
r_stats_text_corpus <- tm_map(r_stats_text_corpus, content_transformer(tolower))
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removePunctuation)
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removeWords, stopwords())
wordcloud(r_stats_text_corpus)
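The default cloud is often noisy; a sketch with a few common wordcloud options (assumes the RColorBrewer package is installed for the palette):
library(RColorBrewer)
# cap the cloud at the 50 most frequent terms and color by frequency band
wordcloud(r_stats_text_corpus, min.freq = 2, max.words = 50,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))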
myDtm <- TermDocumentMatrix(r_stats_text_corpus, control = list(wordLengths = c(1, Inf)))
findFreqTerms(myDtm, lowfreq = 10) # terms that appear at least 10 times
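Related terms can be explored with tm's findAssocs; a sketch (the term "data" is only a placeholder, substitute any term returned above):
findAssocs(myDtm, "data", corlimit = 0.3) # terms correlated with "data" at r >= 0.3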
Look up user ID information
#r_stats <- userTimeline("Trump", n=100) # n is the maximum number of tweets to return
me <- getUser("realdonaldtrump")
me$getId() #25073877
getUser(25073877)
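With the user object in hand, recent tweets can be pulled the same way; a quick sketch (me_tweets is an illustrative name):
me_tweets <- userTimeline(me, n = 50) # most recent tweets, capped by the API
me_tweets_df <- twListToDF(me_tweets)
head(me_tweets_df$text)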
Follower information
# look up full user objects for the follower IDs (slow for accounts with many followers)
followersIds <- lookupUsers(me$getFollowerIDs()) # despite the name, these are user objects
length(followersIds)
followers <- me$getFollowers() # returns a list of user objects
#followersIds[[1]]
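Each element of the list is a twitteR user object, so individual fields come out through accessor methods, e.g.:
followers[[1]]$getScreenName() # screen name of one follower
followers[[1]]$getFollowersCount() # that follower's own follower count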
Extra information from Twitter
trend <- availableTrendLocations()
head(trend)
trends <- getTrends(1) # WOEID 1 = worldwide
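getTrends() returns a data frame, so the usual inspection tools apply; a quick look (assuming the name and query columns that twitteR returns):
head(trends$name) # the trending topic names
head(trends$query) # the corresponding search queries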
Example: analyzing HU tweets
user <- getUser("HarrisburgU")
friends <- user$getFriends() # who HU follows
friends_df <- twListToDF(friends)
save(friends_df, file = "hu_friends.RData")
followers <- user$getFollowers() # HU followers
followers_df <- twListToDF(followers)
save(followers_df, file = "hu_followers.RData")
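As a quick cross-check of the two saved data frames, a sketch that finds the accounts HU follows that also follow back (mutuals is just an illustrative name):
mutuals <- intersect(friends_df$screenName, followers_df$screenName)
length(mutuals) # number of mutual connections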
Some analysis of the Twitter data
Use the information stored in friends_df to answer these questions…
load("hu_friends.RData")
# 1.
friends_df %>% pull(lang) %>% as.factor() %>% summary()
## en
## 147
# all English
# 2.
friends_df %>%
  ggplot(aes(friendsCount)) +
  geom_density()
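Counts like this are usually heavily right-skewed, so a log-scaled version (a sketch; accounts with zero counts drop out with a warning) often shows the shape more clearly:
friends_df %>%
  ggplot(aes(friendsCount)) +
  geom_density() +
  scale_x_log10()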
# 3.
# how active? - check the distribution of the number of their tweets
friends_df %>%
  ggplot(aes(x = statusesCount)) +
  geom_density()
# it looks like most accounts are not super active, but this may reflect
# the actual situation on social media
# 4.
friends_df %>%
  arrange(followersCount) %>%
  slice(n()) %>% # last row after the ascending sort = the maximum
  select(followersCount, name)
# the biggest network: NASA
friends_df %>%
  arrange(statusesCount) %>%
  slice(n()) %>%
  select(statusesCount, name)
# the one that tweets the most: WGAL
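Instead of sorting and slicing the last row, dplyr (>= 1.0) offers slice_max; a sketch listing the top five accounts by follower count:
friends_df %>%
  slice_max(followersCount, n = 5) %>%
  select(name, followersCount)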
# 5.
# plot to see whether the two counts look correlated
friends_df %>%
  ggplot(aes(x = followersCount, y = statusesCount)) +
  geom_point()
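Both counts are skewed, so a log-log view (a sketch; zero counts drop out with a warning) can make any relationship easier to see:
friends_df %>%
  ggplot(aes(x = followersCount, y = statusesCount)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()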
cor.test(x = friends_df$followersCount, y = friends_df$statusesCount)
##
## Pearson's product-moment correlation
##
## data: friends_df$followersCount and friends_df$statusesCount
## t = 1.1467, df = 145, p-value = 0.2534
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06813745 0.25281474
## sample estimates:
## cor
## 0.09480159
# cannot reject the null hypothesis that the correlation is 0
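Given that skew, a rank-based statistic is a sensible robustness check; a sketch using Spearman's rho (ties may trigger an approximate p-value warning):
cor.test(x = friends_df$followersCount, y = friends_df$statusesCount,
         method = "spearman")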