Twitter 1

Harold Nelson

3/22/2022

Setup

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rtweet)
## 
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(httpuv)

Search_tweets

First let’s look at the code in Datacamp.

# Search for tweets matching "#Emmyawards", keeping retweets in the results
twts_emmy <- search_tweets(
  q = "#Emmyawards",
  n = 2000,
  include_rts = TRUE,
  lang = "en"
)

# Preview the first 10 rows, restricted to the first 5 columns
head(twts_emmy[, 1:5], n = 10)
## # A tibble: 10 × 5
##    user_id             status_id           created_at          screen_name text 
##    <chr>               <chr>               <dttm>              <chr>       <chr>
##  1 1378586154618953729 1506349156679372801 2022-03-22 19:16:40 jasonfrazi… "Som…
##  2 337463933           1506267630461763595 2022-03-22 13:52:42 RickeyLami… "Do …
##  3 1216785372505657346 1505311118868336641 2022-03-19 22:31:52 marvelousl… "I’m…
##  4 2712691026          1505006755780173824 2022-03-19 02:22:26 robbe931    "Sto…
##  5 1022485724510253056 1504275336502075395 2022-03-17 01:56:02 sumnertrac  "Abo…
##  6 111443548           1504209025004720129 2022-03-16 21:32:33 sahanesers… "#Th…
##  7 2941255887          1504174186146811904 2022-03-16 19:14:06 collazo_ni… "Sto…
##  8 757250499787751424  1504165372706103297 2022-03-16 18:39:05 JohnApp634… "Sto…
##  9 501453921           1503938968349511681 2022-03-16 03:39:26 mickisuzet… "Ear…
## 10 1409602792403132417 1503699427441135617 2022-03-15 11:47:35 GunsTony2   "@Fa…

Tidy This

I would rather work with dplyr than base R. I would also prefer to get some useful information.

Use glimpse() on the dataframe to see the contents of a tweet and pick out a few items of useful information. Then use select() to extract these bits.

Solution

# List every column of the search result with its type and first few values
twts_emmy %>%
  glimpse()
## Rows: 10
## Columns: 90
## $ user_id                 <chr> "1378586154618953729", "337463933", "121678537…
## $ status_id               <chr> "1506349156679372801", "1506267630461763595", …
## $ created_at              <dttm> 2022-03-22 19:16:40, 2022-03-22 13:52:42, 202…
## $ screen_name             <chr> "jasonfrazier222", "RickeyLamitie", "marvelous…
## $ text                    <chr> "Sometimes I think Emmy’s bed is more comforta…
## $ source                  <chr> "Twitter for iPhone", "LaterMedia", "Twitter f…
## $ display_text_width      <dbl> 238, 184, 259, 140, 140, 147, 140, 140, 250, 2…
## $ reply_to_status_id      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "150348123…
## $ reply_to_user_id        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "14791044"
## $ reply_to_screen_name    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "FaithHill"
## $ is_quote                <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE…
## $ is_retweet              <lgl> FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, T…
## $ favorite_count          <int> 3, 0, 3, 0, 0, 0, 0, 0, 3, 1
## $ retweet_count           <int> 0, 0, 0, 12, 7, 23, 12, 12, 0, 0
## $ quote_count             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ reply_count             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ hashtags                <list> <"jasonfrazier", "emmy", "emmyaward", "emmyawa…
## $ symbols                 <list> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ urls_url                <list> NA, NA, "twitter.com/maiseltv/statu…", NA, NA,…
## $ urls_t.co               <list> NA, NA, "https://t.co/srk3drFKaT", NA, NA, NA,…
## $ urls_expanded_url       <list> NA, NA, "https://twitter.com/maiseltv/status/1…
## $ media_url               <list> "http://pbs.twimg.com/media/FOegkrjVgAUk--C.jp…
## $ media_t.co              <list> "https://t.co/EklmrkF3Et", "https://t.co/sVxPQ…
## $ media_expanded_url      <list> "https://twitter.com/jasonfrazier222/status/1…
## $ media_type              <list> "photo", "photo", "photo", NA, NA, NA, NA, NA…
## $ ext_media_url           <list> "http://pbs.twimg.com/media/FOegkrjVgAUk--C.j…
## $ ext_media_t.co          <list> "https://t.co/EklmrkF3Et", "https://t.co/sVxP…
## $ ext_media_expanded_url  <list> "https://twitter.com/jasonfrazier222/status/1…
## $ ext_media_type          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ mentions_user_id        <list> NA, NA, NA, <"183795890", "183795890">, "1216…
## $ mentions_screen_name    <list> NA, NA, NA, <"ZouvelosGeorge", "ZouvelosGeorg…
## $ lang                    <chr> "en", "en", "en", "en", "en", "en", "en", "en…
## $ quoted_status_id        <chr> NA, NA, "1505228103500677120", NA, NA, NA, NA…
## $ quoted_text             <chr> NA, NA, "Only Lenny could get through to Midg…
## $ quoted_created_at       <dttm> NA, NA, 2022-03-19 17:02:00, NA, NA, NA, NA, …
## $ quoted_source           <chr> NA, NA, "Twitter Media Studio", NA, NA, NA, N…
## $ quoted_favorite_count   <int> NA, NA, 2387, NA, NA, NA, NA, NA, 35, NA
## $ quoted_retweet_count    <int> NA, NA, 247, NA, NA, NA, NA, NA, 9, NA
## $ quoted_user_id          <chr> NA, NA, "847154816593752064", NA, NA, NA, NA, …
## $ quoted_screen_name      <chr> NA, NA, "MaiselTV", NA, NA, NA, NA, NA, "UCLA…
## $ quoted_name             <chr> NA, NA, "The Marvelous Mrs. Maisel", NA, NA, …
## $ quoted_followers_count  <int> NA, NA, 96318, NA, NA, NA, NA, NA, 17827, NA
## $ quoted_friends_count    <int> NA, NA, 123, NA, NA, NA, NA, NA, 1134, NA
## $ quoted_statuses_count   <int> NA, NA, 36087, NA, NA, NA, NA, NA, 7060, NA
## $ quoted_location         <chr> NA, NA, "", NA, NA, NA, NA, NA, "Los Angeles, …
## $ quoted_description      <chr> NA, NA, "She's back, and she means business. W…
## $ quoted_verified         <lgl> NA, NA, TRUE, NA, NA, NA, NA, NA, TRUE, NA
## $ retweet_status_id       <chr> NA, NA, NA, "1502359094396600328", "1502915234…
## $ retweet_text            <chr> NA, NA, NA, "Stop by and say hello to SAG AFT…
## $ retweet_created_at      <dttm> NA, NA, NA, 2022-03-11 19:01:35, 2022-03-13 07…
## $ retweet_source          <chr> NA, NA, NA, "Twitter for iPhone", "Twitter for…
## $ retweet_favorite_count  <int> NA, NA, NA, 150, 82, 62, 150, 150, NA, NA
## $ retweet_retweet_count   <int> NA, NA, NA, 12, 7, 23, 12, 12, NA, NA
## $ retweet_user_id         <chr> NA, NA, NA, "183795890", "1216785372505657346"…
## $ retweet_screen_name     <chr> NA, NA, NA, "ZouvelosGeorge", "marvelouslukek1…
## $ retweet_name            <chr> NA, NA, NA, "George Ζουvelos", "marvelous luke…
## $ retweet_followers_count <int> NA, NA, NA, 6693, 640, 910, 6693, 6693, NA, NA
## $ retweet_friends_count   <int> NA, NA, NA, 5513, 47, 1087, 5513, 5513, NA, NA
## $ retweet_statuses_count  <int> NA, NA, NA, 24527, 10794, 345, 24527, 24527, N…
## $ retweet_location        <chr> NA, NA, NA, "New York City, U.S.A.", "", "Up i…
## $ retweet_description     <chr> NA, NA, NA, "SAG-AFTRA actor- a writer-produce…
## $ retweet_verified        <lgl> NA, NA, NA, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ place_url               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ place_name              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ place_full_name         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ place_type              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ country                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ country_code            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ geo_coords              <list> <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, N…
## $ coords_coords           <list> <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, N…
## $ bbox_coords             <list> <NA, NA, NA, NA, NA, NA, NA, NA>, <NA, NA, NA,…
## $ status_url              <chr> "https://twitter.com/jasonfrazier222/status/15…
## $ name                    <chr> "Jason Frazier", "Rickey Lamitie", "marvelous …
## $ location                <chr> "Los Angeles, CA", "Vail Colorado", "", "Virg…
## $ description             <chr> "🎙🎧🏳️‍🌈 Erk from Nintendo's Fire Emblem Heroes…
## $ url                     <chr> "https://t.co/pDmvePAol8", NA, "https://t.co/E…
## $ protected               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ followers_count         <int> 165, 16, 640, 709, 2766, 1188, 1616, 11192, 36…
## $ friends_count           <int> 237, 58, 47, 602, 495, 1945, 5002, 10895, 273,…
## $ listed_count            <int> 0, 0, 0, 3, 22, 2, 2, 698, 15, 0
## $ statuses_count          <int> 483, 297, 10794, 73211, 2725, 33720, 11538, 62…
## $ favourites_count        <int> 1303, 82, 18843, 68139, 7392, 116524, 16065, 1…
## $ account_created_at      <dttm> 2021-04-04 05:52:13, 2011-07-18 02:24:32, 2020…
## $ verified                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ profile_url             <chr> "https://t.co/pDmvePAol8", NA, "https://t.co/E…
## $ profile_expanded_url    <chr> "http://www.jasonfraziervo.com", NA, "http://i…
## $ account_lang            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ profile_banner_url      <chr> "https://pbs.twimg.com/profile_banners/1378586…
## $ profile_background_url  <chr> NA, "http://abs.twimg.com/images/themes/theme1…
## $ profile_image_url       <chr> "http://pbs.twimg.com/profile_images/137858666…

I’ll use screen_name, created_at, text, and location.

# Keep only the author, timestamp, tweet text, and author location columns
select(twts_emmy, screen_name, created_at, text, location)
## # A tibble: 10 × 4
##    screen_name     created_at          text                         location    
##    <chr>           <dttm>              <chr>                        <chr>       
##  1 jasonfrazier222 2022-03-22 19:16:40 "Sometimes I think Emmy’s b… "Los Angele…
##  2 RickeyLamitie   2022-03-22 13:52:42 "Do you think that these ou… "Vail Color…
##  3 marvelouslukek1 2022-03-19 22:31:52 "I’m not sure how many Emmy… ""          
##  4 robbe931        2022-03-19 02:22:26 "Stop by and say hello to S… "Virginia, …
##  5 sumnertrac      2022-03-17 01:56:02 "About last night…\nMidge w… ""          
##  6 sahaneserseri   2022-03-16 21:32:33 "#TheMonkees: #PeterTork, #… "dünya"     
##  7 collazo_nick    2022-03-16 19:14:06 "Stop by and say hello to S… ""          
##  8 JohnApp63499667 2022-03-16 18:39:05 "Stop by and say hello to S… "Philadelph…
##  9 mickisuzette    2022-03-16 03:39:26 "Early 60's (and late 50's)… "ATL"       
## 10 GunsTony2       2022-03-15 11:47:35 "@FaithHill @FaithHill your… "Where the …

Timeline

First run the code from Datacamp.

# Request up to 3200 tweets from the timeline of the user @Cristiano
get_cris <- get_timeline(user = "@Cristiano", n = 3200)

# Preview the first 10 rows, restricted to the first 5 columns
head(get_cris[, 1:5], n = 10)
## # A tibble: 10 × 5
##    user_id   status_id           created_at          screen_name text           
##    <chr>     <chr>               <dttm>              <chr>       <chr>          
##  1 155659213 1505216503507034112 2022-03-19 16:15:54 Cristiano   Happy father’s…
##  2 155659213 1503426067494846468 2022-03-14 17:41:21 Cristiano   We are Man. Un…
##  3 155659213 1502947140808716289 2022-03-13 09:58:16 Cristiano   Always a pleas…
##  4 155659213 1498703685991514113 2022-03-01 16:56:17 Cristiano   Keep working … 
##  5 155659213 1496201706434088961 2022-02-22 19:14:19 Cristiano   Getting ready …
##  6 155659213 1495461025570861065 2022-02-20 18:11:07 Cristiano   Very important…
##  7 155659213 1493711992510681096 2022-02-15 22:21:05 Cristiano   Back on track!…
##  8 155659213 1492211702175256582 2022-02-11 18:59:28 Cristiano   Focus 🔴⚫️ 🙏… 
##  9 155659213 1491442389382471680 2022-02-09 16:02:29 Cristiano   Children are a…
## 10 155659213 1490095552243109892 2022-02-05 22:50:38 Cristiano   Life is a roll…

Tidy

Use select() to get some more useful information.

Solution

# Keep only the author, timestamp, tweet text, and author location columns
select(get_cris, screen_name, created_at, text, location)
## # A tibble: 3,159 × 4
##    screen_name created_at          text                                 location
##    <chr>       <dttm>              <chr>                                <chr>   
##  1 Cristiano   2022-03-19 16:15:54 Happy father’s day to all👏🏽❤️🙏🏽 …     ""      
##  2 Cristiano   2022-03-14 17:41:21 We are Man. United! Let’s do this t… ""      
##  3 Cristiano   2022-03-13 09:58:16 Always a pleasure and a privilege t… ""      
##  4 Cristiano   2022-03-01 16:56:17 Keep working 💪🏽 https://t.co/a9ju…   ""      
##  5 Cristiano   2022-02-22 19:14:19 Getting ready for the return of the… ""      
##  6 Cristiano   2022-02-20 18:11:07 Very important Premier League win b… ""      
##  7 Cristiano   2022-02-15 22:21:05 Back on track! Nobody gives up and … ""      
##  8 Cristiano   2022-02-11 18:59:28 Focus 🔴⚫️ 🙏🏽 #mufc https://t.co/…   ""      
##  9 Cristiano   2022-02-09 16:02:29 Children are and will always be the… ""      
## 10 Cristiano   2022-02-05 22:50:38 Life is a roller coaster. Hard work… ""      
## # … with 3,149 more rows

First the code from Datacamp. We need to get the dataframe of tweets about artificial intelligence ourselves.

# Search for tweets matching "Artificial Intelligence", keeping retweets.
# The Datacamp exercise supplies this data frame; here it is fetched directly.
tweets_ai <- search_tweets(
  "Artificial Intelligence",
  n = 2000,
  include_rts = TRUE,
  lang = "en"
)

Now run the code from Datacamp.

# Tally the number of tweets contributed by each screen name
sc_name <- table(tweets_ai[["screen_name"]])

# Order the tally from the most tweets to the fewest
sc_name_sort <- sort(sc_name, decreasing = TRUE)

# Show the 10 users with the highest tweet counts
head(sc_name_sort, n = 10)
## 
##  richardkimphd   ML_Tweet_Bot    TechnoJeder    techguyjack SuriyaSubraman 
##             63             42             40             33             32 
##   Robert_R_Art samuel_ludwick   chidambara09         KnXChg        xaelbot 
##             31             20             17             17             15

Tidy

Do the same thing with tidyverse code.

Solution

# Count tweets per screen name and sort from most to fewest.
# count() does the group/summarise/arrange in one step and returns an
# ungrouped tibble; `name = "count"` keeps the original column name.
sc_name <- tweets_ai %>%
  count(screen_name, sort = TRUE, name = "count")

# Show the six most active users
head(sc_name)
## # A tibble: 6 × 2
##   screen_name    count
##   <chr>          <int>
## 1 richardkimphd     63
## 2 ML_Tweet_Bot      42
## 3 TechnoJeder       40
## 4 techguyjack       33
## 5 SuriyaSubraman    32
## 6 Robert_R_Art      31

Followers Count

Try to run the Datacamp code.

Solution

# Extract user data for the Twitter accounts of 4 news sites

# The Datacamp call below won't run: it passes each account as a separate
# argument instead of supplying them together in a single vector.
# users <- lookup_users("nytimes", "CNN", "FoxNews", "NBCNews")

# Instead, collect the screen names in one character vector and pass that.
usrs <- c("nytimes", "CNN", "FoxNews", "NBCNews")
users <- lookup_users(usrs)

# Create a data frame of screen names and follower counts
user_df <- users[, c("screen_name", "followers_count")]

# Display and compare the follower counts for the 4 news sites
user_df
## # A tibble: 4 × 2
##   screen_name followers_count
##   <chr>                 <int>
## 1 nytimes            52348700
## 2 CNN                57230908
## 3 FoxNews            20979059
## 4 NBCNews             9011630