JMSC 6116 Lecture 2: Analyzing #MAGA and @POTUS: Using Twitter API

We start by installing three new packages, “rtweet,” “httpuv,” and “plotly,” and then load them into the environment.

# Install any of the three packages that is missing. The original only
# installed "rtweet" and "httpuv", so a fresh machine failed on "plotly".
for (pkg in c("rtweet", "httpuv", "plotly")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
}

# Attach the packages. library() stops with an error if a package still
# cannot be loaded, whereas require() would only return FALSE and let the
# script fail later with a less helpful "could not find function" message.
library("rtweet")
library("httpuv")
library("plotly")

Then, please insert your appname, consumerKey, consumerSecret, access_token, and access_secret into the following “character” variables.

# Twitter API credentials as character variables.
# Each value is read from an environment variable (for example one set in
# ~/.Renviron) so that real secrets do not have to be saved in this script.
# When the variable is not set, the original placeholder string is used and
# can still be replaced by hand.
appname <- Sys.getenv("TWITTER_APP_NAME", unset = "YOUR APP NAME")
consumerKey <- Sys.getenv("TWITTER_CONSUMER_KEY", unset = "YOUR CONSUMER KEY")
consumerSecret <- Sys.getenv("TWITTER_CONSUMER_SECRET", unset = "YOUR CONSUMER SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "YOUR ACCESS TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET", unset = "YOUR ACCESS SECRET")

Set up your Twitter token:

# Build the API token from the app name and the four credential strings.
# NOTE(review): set_renv = TRUE presumably lets rtweet remember the token for
# later sessions -- confirm against ?create_token.
twitter_token <- create_token(
  app = appname,
  consumer_key = consumerKey,
  consumer_secret = consumerSecret,
  access_token = access_token,
  access_secret = access_secret,
  set_renv = TRUE
)

Once your Twitter API token is ready, we use the command “search_tweets” to search for tweets containing the keyword “#MAGA.” The returned object “maga” (500 tweets, no retweets) is a data frame: [5,] displays its 5th row, and $COLUMN_NAME displays one of its columns.

# Search for tweets containing the hashtag; n caps the request at 500 and
# include_rts = FALSE leaves retweets out of the result.
maga <- search_tweets("#MAGA", n = 500, include_rts = FALSE)
class(maga)  # data class of the returned object

## [1] "tbl_df"     "tbl"        "data.frame"

names(maga)  # names of all columns in the result

##  [1] "user_id"                 "status_id"              
##  [3] "created_at"              "screen_name"            
##  [5] "text"                    "source"                 
##  [7] "display_text_width"      "reply_to_status_id"     
##  [9] "reply_to_user_id"        "reply_to_screen_name"   
## [11] "is_quote"                "is_retweet"             
## [13] "favorite_count"          "retweet_count"          
## [15] "quote_count"             "reply_count"            
## [17] "hashtags"                "symbols"                
## [19] "urls_url"                "urls_t.co"              
## [21] "urls_expanded_url"       "media_url"              
## [23] "media_t.co"              "media_expanded_url"     
## [25] "media_type"              "ext_media_url"          
## [27] "ext_media_t.co"          "ext_media_expanded_url" 
## [29] "ext_media_type"          "mentions_user_id"       
## [31] "mentions_screen_name"    "lang"                   
## [33] "quoted_status_id"        "quoted_text"            
## [35] "quoted_created_at"       "quoted_source"          
## [37] "quoted_favorite_count"   "quoted_retweet_count"   
## [39] "quoted_user_id"          "quoted_screen_name"     
## [41] "quoted_name"             "quoted_followers_count" 
## [43] "quoted_friends_count"    "quoted_statuses_count"  
## [45] "quoted_location"         "quoted_description"     
## [47] "quoted_verified"         "retweet_status_id"      
## [49] "retweet_text"            "retweet_created_at"     
## [51] "retweet_source"          "retweet_favorite_count" 
## [53] "retweet_retweet_count"   "retweet_user_id"        
## [55] "retweet_screen_name"     "retweet_name"           
## [57] "retweet_followers_count" "retweet_friends_count"  
## [59] "retweet_statuses_count"  "retweet_location"       
## [61] "retweet_description"     "retweet_verified"       
## [63] "place_url"               "place_name"             
## [65] "place_full_name"         "place_type"             
## [67] "country"                 "country_code"           
## [69] "geo_coords"              "coords_coords"          
## [71] "bbox_coords"             "status_url"             
## [73] "name"                    "location"               
## [75] "description"             "url"                    
## [77] "protected"               "followers_count"        
## [79] "friends_count"           "listed_count"           
## [81] "statuses_count"          "favourites_count"       
## [83] "account_created_at"      "verified"               
## [85] "profile_url"             "profile_expanded_url"   
## [87] "account_lang"            "profile_banner_url"     
## [89] "profile_background_url"  "profile_image_url"

maga$text[5]  # text of the 5th status

## [1] "I'm doing a quick 5k tomorrow with my sons.\n\nThat means 5 kilos of coke.\n\n#maga #potus"

maga$screen_name[5]  # screen name of the user who posted the 5th status

## [1] "IWasPotusBefore"

maga[5, "screen_name"]  # row/column indexing keeps the value in a tibble

## # A tibble: 1 x 1
##   screen_name    
##   <chr>          
## 1 IWasPotusBefore

maga[5:10, "screen_name"]  # rows 5 to 10 of the same column

## # A tibble: 6 x 1
##   screen_name    
##   <chr>          
## 1 IWasPotusBefore
## 2 cheli23        
## 3 BBergquam      
## 4 grazerdotorg   
## 5 beanbaaag      
## 6 _rafaelzazueta

maga$created_at[5]  # creation time of the 5th status

## [1] "2021-01-26 04:38:07 UTC"

maga$retweet_count[5]  # how many times the 5th status has been retweeted

## [1] 0

mean(maga[["followers_count"]])  # average follower count of the authors

## [1] 2512.538

median(maga[["followers_count"]])  # median follower count of the authors

## [1] 231

# Bin the authors' follower counts into roughly logarithmic ranges and count
# the number of authors in each bin.
# - include.lowest = TRUE closes the first interval on the left, so accounts
#   with exactly 0 followers are counted; with the default left-open
#   intervals they become NA and silently drop out of the table.
# - The Inf upper break keeps accounts with more than 10 million followers.
follower_breaks <- c(0, 10, 100, 1000, 10000, 100000, Inf)
maga_bins <- cut(maga$followers_count, breaks = follower_breaks,
                 include.lowest = TRUE)
maga_h <- data.frame(table(maga_bins))
names(maga_h) <- c("Var1", "Freq")  # column names the plot formulas expect

# Bar chart of the bin frequencies, with the count printed on each bar.
p1 <- plot_ly(maga_h, x = ~Var1, y = ~Freq, text = ~Freq, type = 'bar',
              textposition = 'auto')
p1 <- layout(p1, title = "", xaxis = list(title = "Followers"), yaxis = list(title = "Frequency"))
p1

##
## Exercise 1: Try a larger number of tweets by setting a larger n (see the two commented examples below).
## Lookup the rate limit table: https://developer.twitter.com/en/docs/basics/rate-limits.html
##rt <- search_tweets("#MAGA", n = 18000)
##rt <- search_tweets("#MAGA", n = 25000, retryonratelimit = TRUE)  # beyond 15 mins rate limit
##
##
## Exercise 2: Pipes
## if (!require("dplyr")) install.packages("dplyr", dependencies = TRUE)
## maga_h <- maga$followers_count %>% cut(breaks = c(0,10,100,1000,10000,100000,10000000)) %>% table %>% data.frame
##

Next, we analyze the profile of the Twitter handle @POTUS, President Of The United States (a US government account). Again, the returned values are in a data frame, and we can use $ to show each column.

# Look up the profile of the handle; the result has one column per field.
potus <- lookup_users("POTUS")
potus[["name"]]  # display name of the account

## [1] "President Biden"

potus[["followers_count"]]  # number of followers

## [1] 6354084

potus[["description"]]  # profile description text

## [1] "46th President of the United States, husband to @FLOTUS, proud dad & pop. Tweets may be archived: https://t.co/IURuMIrzxb"

potus[["location"]]  # profile location (an empty string in this output)

## [1] ""

It is interesting to check who is following this handle. We use the command “get_followers” to collect the list of its followers, and use [,] and $ to display the contents of the data frame. Next, we use “lookup_users” to obtain each individual follower’s profile.

# Collect follower ids of the handle (the output below shows 5000 rows).
potus_folls <- get_followers("POTUS")
dim(potus_folls)[1]  # number of rows, i.e. follower ids returned

## [1] 5000

head(potus_folls, n = 6)  # the first 6 follower ids

## # A tibble: 6 x 1
##   user_id            
##   <chr>              
## 1 999165397109637120 
## 2 927666200917749761 
## 3 1353777471418736641
## 4 1353900741484711938
## 5 1467757998         
## 6 1105310767115427840

# Fetch the profile of every follower id collected by get_followers().
potus_folls_data <- lookup_users(potus_folls$user_id)
potus_folls_data$screen_name[10]  # screen name of the 10th follower

## [1] "mrgnfreeman"

potus_folls_data$location[10]  # that follower's profile location

## [1] ""

potus_folls_data$followers_count[10]  # that follower's own follower count

## [1] 5

class(potus_folls_data$location)  # data class of the location column

## [1] "character"

# Ten most frequent profile locations. head() returns at most 10 entries, so
# a sample with fewer than 10 distinct locations is not padded with NA the
# way [1:10] would pad it. The blank label in the output is the empty-string
# location.
head(sort(table(potus_folls_data$location), decreasing = TRUE), 10)

## 
##                   United States  Washington, DC    New York, NY Los Angeles, CA 
##            2556              52              28              26              25 
##     Chicago, IL California, USA     Houston, TX      Austin, TX     Atlanta, GA 
##              24              19              18              16              13

class(potus_folls_data$followers_count)  # data class of followers_count

## [1] "integer"

summary(potus_folls_data$followers_count)  # five-number summary plus mean

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     2.0    21.0   342.4   115.2 89159.0

# Bin the followers' own follower counts into roughly logarithmic ranges and
# count the number of accounts in each bin.
# - include.lowest = TRUE closes the first interval on the left, so accounts
#   with exactly 0 followers are counted; with the default left-open
#   intervals they become NA and silently drop out of the table.
# - The Inf upper break keeps accounts with more than 10 million followers.
follower_breaks <- c(0, 10, 100, 1000, 10000, 100000, Inf)
potus_bins <- cut(potus_folls_data$followers_count, breaks = follower_breaks,
                  include.lowest = TRUE)
potus_h <- data.frame(table(potus_bins))
names(potus_h) <- c("Var1", "Freq")  # column names the plot formulas expect

# Bar chart of the bin frequencies, with the count printed on each bar.
p2 <- plot_ly(potus_h, x = ~Var1, y = ~Freq, text = ~Freq, type = 'bar',
              textposition = 'auto')
p2 <- layout(p2, title = "", xaxis = list(title = "Followers"), yaxis = list(title = "Frequency"))
p2

#
## Exercise: Try another Twitter handle. What does its follower distribution look like?  
#

Finally, let’s have a look at the Twitter trend data and check the class of the returned data.

# Fetch the currently trending topics for several places so their overlap
# can be compared afterwards. Each get_trends() result is stored under a
# short place code and exposes a `trend` column that the comparisons read.
# `loc` holds the result of trends_available(); it is not referenced again
# in this section.
loc <- trends_available()
sf <- get_trends("San Francisco") # trending topics in San Francisco
ny <- get_trends("New York") # trending topics in New York
tk <- get_trends("Tokyo") # trending topics in Tokyo
kr <- get_trends("Korea") # trending topics in Korea
us <- get_trends("United States") # trending topics in the US
ww <- get_trends("Worldwide") # trending topics worldwide
class(ww) # data class of a get_trends() result (see the output below)

## [1] "tbl_df"     "tbl"        "data.frame"

Last, check the extent to which the Twitter trend of each location is shared with the worldwide one.

# Share of one place's trending topics that are also trending in another
# place. `a %in% b` is a logical vector over a's trends, and mean() of a
# logical vector is the proportion of TRUE values, i.e. matches / length(a).
mean(ww$trend %in% sf$trend)  # share of worldwide trends also trending in SF

## [1] 0.22

mean(ww$trend %in% ny$trend)  # share of worldwide trends also trending in NY

## [1] 0.22

mean(ww$trend %in% tk$trend)  # share of worldwide trends also trending in Tokyo

## [1] 0.08

mean(ww$trend %in% kr$trend)  # share of worldwide trends also trending in Korea

## [1] 0

mean(ww$trend %in% us$trend)  # share of worldwide trends also trending in the US

## [1] 0.22

# The numerator counts NY trends, so the proportion is taken over the NY
# trends; the original divided by the number of worldwide trends instead.
mean(ny$trend %in% sf$trend)  # share of NY trends also trending in SF

## [1] 0.34

JMSC 6116 Lecture 2: Analyzing #MAGA and @POTUS: Using Twitter API

King-wa Fu

January 29, 2021