tidyverse Overview: Russian Trolls

Intro

install and call packages

First, we need to install the tidyverse. Recall that you only need to install a package once per computer.

After installing, you then need to call the library to “activate” it.

# install packages
# install.packages("tidyverse")

# call (i.e., activate) tidyverse
library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

load csv file

Next, let’s load the csv file from the url.

# address of the tweets CSV file
url <- "http://nodeassets.nbcnews.com/russian-twitter-trolls/tweets.csv"

# download and parse the file; read_csv() reports the column types it chose
tweets <- read_csv(file = url)

## Parsed with column specification:
## cols(
##   user_id = col_double(),
##   user_key = col_character(),
##   created_at = col_double(),
##   created_str = col_datetime(format = ""),
##   retweet_count = col_integer(),
##   retweeted = col_character(),
##   favorite_count = col_integer(),
##   text = col_character(),
##   tweet_id = col_double(),
##   source = col_character(),
##   hashtags = col_character(),
##   expanded_urls = col_character(),
##   posted = col_character(),
##   mentions = col_character(),
##   retweeted_status_id = col_double(),
##   in_reply_to_status_id = col_double()
## )

Count Function

With the dataframe, let’s use the count() function to count the number of tweets per user (user_key).

# tally the rows (tweets) for each distinct user_key and print the result
count(x = tweets, user_key)

## # A tibble: 453 x 2
##    user_key          n
##    <chr>         <int>
##  1 4ever1937         6
##  2 4mysquad         43
##  3 6druz             5
##  4 aantiracist      37
##  5 abigailssilk     17
##  6 acejinev         65
##  7 adamchapmanjr    38
##  8 adrgreerr        13
##  9 adrienne_gg      40
## 10 agnesgrhm         1
## # ... with 443 more rows

# 1. plain function call; `<-` stores the result instead of printing it
c <- count(tweets, user_key)

# 2. the same call written with the pipe: `tweets` is passed to count()
#    as its first argument
c2 <- tweets %>% count(user_key)

# 3. the long form: group the rows by user, then count the rows per group
c3 <- tweets %>%
  group_by(user_key) %>%
  summarise(Count = n())

Question: What is the difference between these three dataframes?

What were the top tweets by retweet and favorite (likes) counts?

Let’s use other dplyr functions to examine the top retweets…

# keep the columns of interest, then rank tweets from most to least retweeted
tweets %>%
  select(user_key, created_str, text, retweet_count) %>%
  arrange(desc(retweet_count))

## # A tibble: 203,451 x 4
##    user_key        created_str         text                  retweet_count
##    <chr>           <dttm>              <chr>                         <int>
##  1 trayneshacole   2016-11-17 20:27:21 go all the way off h…         20494
##  2 gloed_up        2016-03-12 19:50:32 "#AnthonyCage was bl…         18209
##  3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly be…         13636
##  4 ten_gop         2016-10-11 02:11:02 "OMG, this new Anti-…         12078
##  5 ten_gop         2016-10-18 18:44:14 "RT the hell out of …         12042
##  6 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the on…         11616
##  7 jenn_abrams     2016-09-18 16:42:48 "Some guy right in H…         11363
##  8 crystal1johnson 2016-10-09 01:27:00 3 Black children car…         10823
##  9 ten_gop         2016-10-20 05:40:07 "BREAKING🚨 \n\nHilla…         10467
## 10 bleepthepolice  2016-09-21 15:37:27 "Wow. Hadn't thought…         10310
## # ... with 203,441 more rows

… and most favorited (liked) tweets.

# keep the columns of interest, then rank tweets from most to least favorited
tweets %>%
  select(user_key, created_str, text, favorite_count) %>%
  arrange(desc(favorite_count))

## # A tibble: 203,451 x 4
##    user_key        created_str         text                 favorite_count
##    <chr>           <dttm>              <chr>                         <int>
##  1 trayneshacole   2016-11-17 20:27:21 go all the way off …          26655
##  2 jenn_abrams     2016-09-18 16:42:48 "Some guy right in …          16068
##  3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly b…          12815
##  4 ten_gop         2016-10-11 02:11:02 "OMG, this new Anti…          10867
##  5 ten_gop         2016-10-20 05:40:07 "BREAKING🚨 \n\nHill…          10437
##  6 crystal1johnson 2016-10-09 01:27:00 3 Black children ca…          10340
##  7 trayneshacole   2016-11-10 22:50:19 "White boyfriend sh…           9265
##  8 pamela_moore13  2016-06-16 23:35:47 "I would rather tak…           9242
##  9 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the o…           8969
## 10 bleepthepolice  2016-09-21 15:37:27 "Wow. Hadn't though…           8495
## # ... with 203,441 more rows

We can then count the number of tweets per day with dplyr and plot the daily counts with ggplot.

# daily totals: one row per calendar date with the number of tweets on it
# (note: `sum` is also the name of a base R function; calls such as sum(x)
# still work because R skips non-function objects when looking up a call)
sum <- tweets %>%
  mutate(Date = as.Date(created_str)) %>% # timestamp -> calendar date
  group_by(Date) %>%
  summarise(Count = n())

# line chart of the daily totals
ggplot(sum, aes(Date, Count)) +
  geom_line()

# print the table of tweets per calendar date
tweets %>%
  mutate(Date = as.Date(created_str)) %>%
  group_by(Date) %>%
  summarise(Count = n())

## # A tibble: 931 x 2
##    Date       Count
##    <date>     <int>
##  1 2014-07-14     1
##  2 2014-07-17     1
##  3 2014-07-20     2
##  4 2014-07-22     3
##  5 2014-07-23     1
##  6 2014-07-25     1
##  7 2014-07-26     1
##  8 2014-07-28     1
##  9 2014-07-29     1
## 10 2014-08-04     1
## # ... with 921 more rows

# daily series drawn as a line with a point on each day, plus a title,
# axis labels and a source caption
ggplot(sum, aes(Date, Count)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Daily Tweet Counts by Russian Trolls",
    x = "Time (Day)",
    y = "Number of Tweets",
    caption = "Source: NBC News"
  )

We can also save the plot to an object (m).

# store the plot in `m` instead of drawing it right away
m <- ggplot(sum, aes(Date, Count)) +
  geom_line() +
  geom_point()

# typing the object's name on its own draws the stored plot
m

With that object, you can add new layers to the plot.

# `+` adds to a stored plot; here: the title, axis labels and caption
m +
  labs(
    title = "Daily Tweet Counts by Russian Trolls",
    x = "Time (Day)",
    y = "Number of Tweets",
    caption = "Source: NBC News"
  )

Stringr to filter text

Let’s use the stringr package to filter tweets that mention either #keithlamontscott or Charlotte (| is the “OR” operator).

# use stringr for text filter -- part of the tidyverse 
library(stringr)

# Pattern to search for; `|` means "or", so a tweet matches if its text
# contains either term.
term <- "#keithlamontscott|charlotte"

# Keep the tweets whose text matches `term`, ignoring case.
# `text` refers to the column of the data flowing through the pipe. Writing
# `tweets$text` here would instead test the global `tweets` object, which is
# only correct while the piped data is exactly `tweets` (it breaks as soon as
# an earlier step filters, reorders or groups the rows).
# regex(ignore_case = TRUE) replaces lower-casing both sides: tolower() on a
# pattern would corrupt case-sensitive regex escapes ("\\S" becomes "\\s").
q <- tweets %>%
  filter(str_detect(text, regex(term, ignore_case = TRUE)))

Add in full pipeline

Let’s now link the steps together (filter the tweets, count them per day, and plot the result) in a single pipeline.

# with full pipeline
# Search term (a regular expression); matching ignores case, so "Obama"
# and "obama" give the same result.
term <- "obama"

# filter -> count per day -> line chart, in one chain; the plot is stored in q.
# `text` is the column of the piped data (not `tweets$text`, which would test
# the global object regardless of what flows through the pipe).
q <- tweets %>%
  filter(str_detect(text, regex(term, ignore_case = TRUE))) %>%
  group_by(Date = as.Date(created_str)) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Date, y = Count)) +
  geom_line() +
  labs(
    title = paste0("Tweets filtered by `", term, "`"),
    x = "Day",
    y = "Number of Tweets"
  )

# draw the stored plot
q

first html widget: plotly

Last, let’s use the plotly package to make this ggplot interactive.

# install.packages("plotly")
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# convert the stored ggplot into an interactive plotly widget
ggplotly(p = q)