library(academictwitteR) # get_all_tweets()
library(rtweet) # save_as_csv() that prepends numerical ids as characters
library(dplyr) # %>% convenient data cleaning

Input the bearer token (bearer.token) and the file-name pattern (filepattern)

Read in Tweets Data

  1. List all files that meet the search patterns

  2. Read in all the files as a list

  3. Use do.call(what = "rbind", args = lapply(nlist, as.data.frame)) to combine the list of files into a single data frame, tweet

Get File Names

# List the files in the working directory whose names match `filepattern`.
# Bare file names (not full paths) are returned.
myfiles <- list.files(path = getwd(), pattern = filepattern, full.names = FALSE)
# Print masked file names: first two characters, "****", last eight characters.
paste0(
  substr(myfiles, 1, 2),
  "****",
  substr(myfiles, nchar(myfiles) - 7, nchar(myfiles))
) # file names
##  [1] "gc****1T19.csv" "gc****7T18.csv" "gc****6T00.csv" "gc****1T17.csv"
##  [5] "gc****0T15.csv" "gc****3T18.csv" "gc****6T17.csv" "gc****1T05.csv"
##  [9] "gc****3T16.csv" "gc****0T20.csv" "gc****5T00.csv" "gc****6T18.csv"
# Keep only the first 10 characters (the date part) of each file's ctime stamp.
substr(file.info(myfiles)[["ctime"]], start = 1, stop = 10) # File create time
##  [1] "2023-01-16" "2023-01-16" "2023-01-16" "2023-01-16" "2023-01-16"
##  [6] "2023-01-16" "2023-01-16" "2023-01-16" "2023-01-16" "2023-01-16"
## [11] "2023-01-16" "2023-01-16"
# Read every listed file into a list, one element per file.
# lapply() simply returns an empty list when `myfiles` is empty, whereas a
# `1:length(myfiles)` loop would iterate over c(1, 0) and try to read a
# non-existent file.
nlist <- lapply(myfiles, read_twitter_csv)
# Stack the per-file data frames into one data frame.
tweet <- do.call(what = "rbind", args = lapply(nlist, as.data.frame))
# Drop exact duplicate rows.
tweet <- unique(tweet)
nrow(tweet) # How many rows in the tweet dataset
## [1] 80740
# Count the distinct values in the status_id column.
n_distinct(tweet$status_id) # How many unique tweet ids in the dataset? The two numbers should be close.
## [1] 79997

Get Unique IDs

# Pool the referenced-user ids and the author ids, keeping each id once.
userids <- unique(c(tweet$referenced_user_id, tweet$author_id)) # author_id or referenced_user_id
# referenced_user_id may be unavailable, so drop any NA entry.
userids <- userids[!is.na(userids)]
length(userids) # How many Twitter users?
## [1] 12364
# Summarise the number of characters in each user id, rounded to integers.
userids %>%
  nchar() %>%
  summary() %>%
  round() # length distribution of user ids
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       9      10      13      18      19
# Masked preview of the first few user ids: keep only as many trailing
# characters as the shortest id has, so every preview has the same width.
paste0(
  "*****",
  head(
    substr(
      userids,
      nchar(userids) - min(nchar(userids)) + 1,
      nchar(userids)
    )
  )
) # last few digits of some user ids
## [1] "*****28" "*****42" "*****15" "*****46" "*****86" "*****03"

There are 12364 unique users.

Download Raw Data

# Fetch the profile of every collected user id, authenticating with the
# bearer token supplied earlier in the script.
rawuser <- get_user_profile(
  x = userids,
  bearer_token = bearer.token
)
## Processing from 1 to 100
## Processing from 101 to 200
## Processing from 201 to 300
## Processing from 301 to 400
## Processing from 401 to 500
## Processing from 501 to 600
## Processing from 601 to 700
## Processing from 701 to 800
## Processing from 801 to 900
## Processing from 901 to 1000
## Processing from 1001 to 1100
## Processing from 1101 to 1200
## Processing from 1201 to 1300
## Processing from 1301 to 1400
## Processing from 1401 to 1500
## Processing from 1501 to 1600
## Processing from 1601 to 1700
## Processing from 1701 to 1800
## Processing from 1801 to 1900
## Processing from 1901 to 2000
## Processing from 2001 to 2100
## Processing from 2101 to 2200
## Processing from 2201 to 2300
## Processing from 2301 to 2400
## Processing from 2401 to 2500
## Processing from 2501 to 2600
## Processing from 2601 to 2700
## Processing from 2701 to 2800
## Processing from 2801 to 2900
## Processing from 2901 to 3000
## Processing from 3001 to 3100
## Processing from 3101 to 3200
## Processing from 3201 to 3300
## Processing from 3301 to 3400
## Processing from 3401 to 3500
## Processing from 3501 to 3600
## Processing from 3601 to 3700
## Processing from 3701 to 3800
## Processing from 3801 to 3900
## Processing from 3901 to 4000
## Processing from 4001 to 4100
## Processing from 4101 to 4200
## Processing from 4201 to 4300
## Processing from 4301 to 4400
## Processing from 4401 to 4500
## Processing from 4501 to 4600
## Processing from 4601 to 4700
## Processing from 4701 to 4800
## Processing from 4801 to 4900
## Processing from 4901 to 5000
## Processing from 5001 to 5100
## Processing from 5101 to 5200
## Processing from 5201 to 5300
## Processing from 5301 to 5400
## Processing from 5401 to 5500
## Processing from 5501 to 5600
## Processing from 5601 to 5700
## Processing from 5701 to 5800
## Processing from 5801 to 5900
## Processing from 5901 to 6000
## Processing from 6001 to 6100
## Processing from 6101 to 6200
## Processing from 6201 to 6300
## Processing from 6301 to 6400
## Processing from 6401 to 6500
## Processing from 6501 to 6600
## Processing from 6601 to 6700
## Processing from 6701 to 6800
## Processing from 6801 to 6900
## Processing from 6901 to 7000
## Processing from 7001 to 7100
## Processing from 7101 to 7200
## Processing from 7201 to 7300
## Processing from 7301 to 7400
## Processing from 7401 to 7500
## Processing from 7501 to 7600
## Processing from 7601 to 7700
## Processing from 7701 to 7800
## Processing from 7801 to 7900
## Processing from 7901 to 8000
## Processing from 8001 to 8100
## Processing from 8101 to 8200
## Processing from 8201 to 8300
## Processing from 8301 to 8400
## Processing from 8401 to 8500
## Processing from 8501 to 8600
## Processing from 8601 to 8700
## Processing from 8701 to 8800
## Processing from 8801 to 8900
## Processing from 8901 to 9000
## Processing from 9001 to 9100
## Processing from 9101 to 9200
## Processing from 9201 to 9300
## Processing from 9301 to 9400
## Processing from 9401 to 9500
## Processing from 9501 to 9600
## Processing from 9601 to 9700
## Processing from 9701 to 9800
## Processing from 9801 to 9900
## Processing from 9901 to 10000
## Processing from 10001 to 10100
## Processing from 10101 to 10200
## Processing from 10201 to 10300
## Processing from 10301 to 10400
## Processing from 10401 to 10500
## Processing from 10501 to 10600
## Processing from 10601 to 10700
## Processing from 10701 to 10800
## Processing from 10801 to 10900
## Processing from 10901 to 11000
## Processing from 11001 to 11100
## Processing from 11101 to 11200
## Processing from 11201 to 11300
## Processing from 11301 to 11400
## Processing from 11401 to 11500
## Processing from 11501 to 11600
## Processing from 11601 to 11700
## Processing from 11701 to 11800
## Processing from 11801 to 11900
## Processing from 11901 to 12000
## Processing from 12001 to 12100
## Processing from 12101 to 12200
## Processing from 12201 to 12300
## Processing from 12301 to 12364

Process Raw Data

The variable names id and created_at in the rawuser data frame are also used in the tweet data frame. To avoid ambiguity, rename id to author_id and created_at to account_created_at.

# Rename the two columns whose names are also used in the tweet data frame.
colnames(rawuser)[colnames(rawuser) == "id"] <- "author_id"
colnames(rawuser)[colnames(rawuser) == "created_at"] <- "account_created_at"

Create a new data frame newuser from rawuser. First, copy the variables that are already flat. Second, extract the variables nested under public_metrics in rawuser and copy them to newuser.

# Start from the profile columns that are already flat in `rawuser`.
newuser <- rawuser[, c(
  "author_id", "account_created_at", "username", "name", "description",
  "verified", "protected"
)]
# Copy the counts nested under `public_metrics` into top-level columns.
newuser$followers_count <- rawuser$public_metrics$followers_count
newuser$tweet_count <- rawuser$public_metrics$tweet_count
newuser$following_count <- rawuser$public_metrics$following_count

Take a look at the processed data

# Number of rows and columns in the processed data.
c(nrow(newuser), ncol(newuser))
## [1] 11774    10
colnames(newuser) # What are the variables?
##  [1] "author_id"          "account_created_at" "username"          
##  [4] "name"               "description"        "verified"          
##  [7] "protected"          "followers_count"    "tweet_count"       
## [10] "following_count"
# Preview the first rows of columns 6 to 10.
head(newuser[, 6:10])
##   verified protected followers_count tweet_count following_count
## 1    FALSE     FALSE             674       91768            3461
## 2    FALSE     FALSE             140         996             792
## 3    FALSE      TRUE            1735       23183            2286
## 4     TRUE     FALSE          292619       62692            2004
## 5    FALSE     FALSE             165        1795               0
## 6     TRUE     FALSE          464902        9564            1015

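The processed newuser data has 11774 observations and 10 columns. That is fewer than the 12364 user IDs requested, presumably because some profiles could not be retrieved.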

Save Data

# Build the output file name from the number of rows in `newuser`, then write
# the data frame to that CSV file.
thetitle <- paste0("gcbcklog_", nrow(newuser), ".csv")
save_as_csv(newuser, thetitle)