Harold Nelson
3/26/2022
library(plyr)       # setup chunk was not echoed; load order inferred from the masking messages below
library(tidyverse)
library(rtweet)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
##
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
##
## flatten
Compare the two hashtags (#rstats and #pandas) on the rate of posting. Use n = 500.
# time_to_get(): how long it took for the ntweets most recent original
# (non-reply, non-retweet, non-quote) tweets on a search term to accumulate.
time_to_get = function(term, ntweets){
  search_tweets(term, n = ntweets) %>%
    filter(is.na(reply_to_screen_name) &
             is_retweet == FALSE &
             is_quote == FALSE) %>%
    summarize(max(created_at) - min(created_at))
}
time_to_get("#rstats", 500)
## # A tibble: 1 × 1
## `max(created_at) - min(created_at)`
## <drtn>
## 1 1.910278 hours
time_to_get("#pandas", 500)   # the second hashtag is assumed to be #pandas, as in the comparisons below
## # A tibble: 1 × 1
## `max(created_at) - min(created_at)`
## <drtn>
## 1 20.35806 hours
The 500 most recent original #rstats tweets span about 1.9 hours, while the corresponding #pandas tweets span about 20.4 hours, so #rstats is being posted roughly ten times as fast.
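A hedged variant that reports the posting rate directly, in tweets per hour, rather than a time span; the helper name posts_per_hour is mine, not from the assignment:
posts_per_hour = function(term, ntweets){
  search_tweets(term, n = ntweets) %>%
    filter(is.na(reply_to_screen_name) &
             is_retweet == FALSE &
             is_quote == FALSE) %>%
    summarize(rate = n() /
                as.numeric(difftime(max(created_at), min(created_at), units = "hours")))
}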
Use fract_original() to compare the originality of the postings for these two hashtags.
# fract_original(): the share of sampled tweets that are original posts
# (not replies, retweets, or quotes).
fract_original = function(term, ntweets){
  search_tweets(term, n = ntweets) %>%
    mutate(is_orig = is.na(reply_to_screen_name) &
             is_retweet == FALSE &
             is_quote == FALSE) %>%
    summarize(mean(is_orig))
}
fract_original("#rstats",500)
## # A tibble: 1 × 1
## `mean(is_orig)`
## <dbl>
## 1 0.170
fract_original("#pandas", 500)   # again assuming #pandas as the second hashtag
## # A tibble: 1 × 1
## `mean(is_orig)`
## <dbl>
## 1 0.119
So roughly 17% of the sampled #rstats tweets are original posts (not replies, retweets, or quotes), compared with roughly 12% for #pandas.
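For a finer-grained comparison than a single fraction, the sample can be broken down by post type; a sketch (my extension, not part of the assignment):
search_tweets("#rstats", n = 500) %>%
  mutate(type = case_when(
    is_retweet                   ~ "retweet",
    is_quote                     ~ "quote",
    !is.na(reply_to_screen_name) ~ "reply",
    TRUE                         ~ "original"
  )) %>%
  count(type) %>%
  mutate(share = n / sum(n))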
Examine the distribution of languages for these two hashtags. Use n = 2000.
search_tweets("rstats",n = 2000) %>%
ggplot(aes(x = lang)) +
geom_bar() +
ggtitle("Language Distribution for #rstats")
search_tweets("pandas",n = 2000) %>%
ggplot(aes(x = lang)) +
geom_bar() +
ggtitle("Language Distribution for #pandas")
Use get_timeline() to get and display the followers_count, friends_count, and the golden ratio (followers_count divided by friends_count) for the following screen names.
tl = get_timeline(c("@HadleyWickham", "@juliasilge", "@wesmckinn"), n = 100)

tl %>%
  group_by(screen_name) %>%
  summarize(followers = mean(followers_count),
            friends = mean(friends_count)) %>%
  ungroup() %>%
  mutate(golden_ratio = followers / friends)
## # A tibble: 3 × 4
## screen_name followers friends golden_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 hadleywickham 131668 283 465.
## 2 juliasilge 45087 754 59.8
## 3 wesmckinn 56228 872 64.5
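The follower and friend counts repeat on every row of a user's timeline, so mean() inside summarize() is just a way to collapse to one row per user. An equivalent sketch, assuming the counts really are constant within each screen name:
tl %>%
  distinct(screen_name, followers_count, friends_count) %>%
  mutate(golden_ratio = followers_count / friends_count)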
Get the current trending topics in New York. Display the top 10.
gtny = get_trends("New York")

gtny %>%
  arrange(desc(tweet_volume)) %>%
  select(query, tweet_volume) %>%
  distinct() %>%
  head(10)
## # A tibble: 10 × 2
## query tweet_volume
## <chr> <int>
## 1 %23Oscars 586112
## 2 Beyonc%C3%A9 146162
## 3 Duke 121845
## 4 %22World+Cup%22 105820
## 5 Zendaya 98637
## 6 Hollywood 85457
## 7 %22Final+Four%22 81408
## 8 Dune 80119
## 9 Chanel 58026
## 10 Kansas 53529
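The query column comes back percent-encoded (%23 is "#", %22 is a double quote, and "+" stands for a space). A hedged sketch for decoding it into readable text with base R's URLdecode(), which handles one string at a time:
gtny %>%
  arrange(desc(tweet_volume)) %>%
  select(query, tweet_volume) %>%
  distinct() %>%
  mutate(query = map_chr(query, ~ URLdecode(gsub("\\+", " ", .x)))) %>%
  head(10)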
I found a topic that I knew nothing about in my results. At that time it was “SaudiArabianGP”. How would I look at the text of some of the tweets on that topic? Do the same thing for something you don’t understand.
Maybe you won’t see this today.
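The chunk that pulled these tweets was not echoed; the following is a sketch of one way to do it, with the search term, n = 20, and the select() of the text column all assumed from the output below.
search_tweets("#SaudiArabianGP", n = 20) %>%
  select(text)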
## # A tibble: 20 × 1
## text
## <chr>
## 1 "#F1 | サウジアラビアGP🇸🇦\n決勝速報📣\n\n優勝 フェルスタッペン🚀🚀\n4位 ペレ…
## 2 "VERSTAPPEN WINS IN JEDDAH!! 🏆\n\nLeclerc finishes just behind in second, S…
## 3 "@CoachDonMega @SaudiArabianGP @fia @F1 The sound of your barking is loud an…
## 4 "Verstappen:\"It was a tricky one. One the mediums again. Just not a lot of …
## 5 "Verstappen:\"The VSC came out and you dont know which car is going to come …
## 6 "¡Muy cerca han estado los Alpine de tocarse! \n🟣https://t.co/UsNqEo9XqV\n…
## 7 "Se anota infracción del procedimiento de safety car de Pérez.\n\nSainz: \"Y…
## 8 "¡¡PROBLEMAS para Tsunoda!!\n\n\"He perdido el motor. He perdido el motor\".…
## 9 "Verstappen contra Leclerc, round 2 🍿 #SaudiArabianGP https://t.co/SFhKfKAb…
## 10 "\"UNA BUENA CARRERA PARA 'CHECO' EN CUANTO A LA SUMA DE PUNTOS\" \n\n#F1xFO…
## 11 "📌| مشهد الختام 🏎\n#فورمولا1_في_السعودية\n#SaudiaArabianGP\n https://t.co/o…
## 12 "Thumbs up all round! 👍👍\n\n#SaudiArabianGP #F1 https://t.co/E4eNCKeSnN"
## 13 "YES BOYS 🙌 P1☝️\n\nFeels amazing to be on top after such an exciting race …
## 14 "Thumbs up all round! 👍👍\n\n#SaudiArabianGP #F1 https://t.co/E4eNCKeSnN"
## 15 "Leclerc leads after two rounds \n\n#SaudiArabianGP #F1 https://t.co/JHck65s…
## 16 "The #DHLFastestPitStop of the #SaudiArabianGP goes to @McLarenF1 for the se…
## 17 "#SaudiArabianGP Saudiarabia land full of gay princes that are fucking arou…
## 18 "An incredible battle, from start to finish 👊\n\nThis, is why we love Formu…
## 19 "#F1 🇸🇦 #SaudiArabianGP - Carrera\n\n🎙️#Perez \"La entrada del SC, fue el pe…
## 20 "YES BOYS 🙌 P1☝️\n\nFeels amazing to be on top after such an exciting race …
Here is the code from the course.
# Extract tweets on #walmart and exclude retweets
walmart_twts <- search_tweets("#walmart", n = 18000, include_rts = FALSE, retryonratelimit = TRUE)
## retry on rate limit...
## waiting about 12 minutes...
The standard search endpoint returns at most about 18,000 tweets per 15-minute window, so retryonratelimit = TRUE tells search_tweets() to wait out the limit and keep collecting.
head(walmart_twts)   # the call printing this preview was not echoed; head() is assumed
## # A tibble: 6 × 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 25473002 1508236674811109379 2022-03-28 00:16:59 rakum "There i… Twitt…
## 2 309912680 1508236404823887874 2022-03-28 00:15:55 Annie_Acorn "Two Res… Twitt…
## 3 309912680 1508236171700277255 2022-03-28 00:14:59 Annie_Acorn "Gotten … Twitt…
## 4 309912680 1505695094971711491 2022-03-20 23:57:39 Annie_Acorn "MY NEWE… Twitt…
## 5 309912680 1506726322894221313 2022-03-23 20:15:23 Annie_Acorn "Do YOU … Twitt…
## 6 309912680 1506725548004986883 2022-03-23 20:12:18 Annie_Acorn "Can a s… Twitt…
## # … with 84 more variables: display_text_width <dbl>, reply_to_status_id <chr>,
## # reply_to_user_id <chr>, reply_to_screen_name <chr>, is_quote <lgl>,
## # is_retweet <lgl>, favorite_count <int>, retweet_count <int>,
## # quote_count <int>, reply_count <int>, hashtags <list>, symbols <list>,
## # urls_url <list>, urls_t.co <list>, urls_expanded_url <list>,
## # media_url <list>, media_t.co <list>, media_expanded_url <list>,
## # media_type <list>, ext_media_url <list>, ext_media_t.co <list>, …
Use the hour() and wday() functions from the lubridate package to see how these tweets vary over a week.
library(lubridate)   # chunk not echoed above; lubridate supplies wday() and hour()
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Keep only the timestamp and derive day-of-week and hour-of-day factors.
walmart_twts = walmart_twts %>%
  select(created_at) %>%
  mutate(day = wday(created_at, label = TRUE, abbr = TRUE),
         hour = factor(hour(created_at)))

walmart_twts %>%
  ggplot(aes(x = hour, y = day)) +
  geom_count() +
  ggtitle("Walmart Tweets by Day of Week and Hour")
Can you explain what you see?