01-russian-trolls.utf8.md

# install packages
# install.packages("tidyverse")

# load (i.e., attach) the tidyverse packages
library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

# download the tweets CSV from the remote address and parse it with readr
url <- "http://nodeassets.nbcnews.com/russian-twitter-trolls/tweets.csv"
tweets <- read_csv(file = url)

## Parsed with column specification:
## cols(
##   user_id = col_double(),
##   user_key = col_character(),
##   created_at = col_double(),
##   created_str = col_datetime(format = ""),
##   retweet_count = col_integer(),
##   retweeted = col_character(),
##   favorite_count = col_integer(),
##   text = col_character(),
##   tweet_id = col_double(),
##   source = col_character(),
##   hashtags = col_character(),
##   expanded_urls = col_character(),
##   posted = col_character(),
##   mentions = col_character(),
##   retweeted_status_id = col_double(),
##   in_reply_to_status_id = col_double()
## )

# functions in tidyverse
# count() tallies the rows of `tweets` for each distinct user_key; the result
# has one row per key, with the tally in a column named `n`
count(tweets, user_key)

## # A tibble: 453 x 2
##    user_key          n
##    <chr>         <int>
##  1 4ever1937         6
##  2 4mysquad         43
##  3 6druz             5
##  4 aantiracist      37
##  5 abigailssilk     17
##  6 acejinev         65
##  7 adamchapmanjr    38
##  8 adrgreerr        13
##  9 adrienne_gg      40
## 10 agnesgrhm         1
## # ... with 443 more rows

# assignment operator
# `<-` stores the result under a name instead of printing it
# NOTE(review): `c` is also the name of base R's combine function. Storing a
# data frame under that name does not stop calls such as c(1, 2) from working,
# but a more descriptive name would be easier to read.
c <- count(tweets, user_key)

# piping
# `%>%` passes its left-hand side in as the first argument of the next call,
# so this is the same count() call as above written as a pipeline
c2 <- tweets %>%
  count(user_key)

# group_by and summarise
# groups the rows by user_key, then collapses each group to a single row whose
# `Count` column holds that group's row count from n()
c3 <- tweets %>%
  group_by(user_key) %>%
  summarise(Count=n())

# what is the difference between these three dataframes?


### What were the top tweets by retweet and favorite (likes) counts?

# keep only the columns of interest, then rank from most to least retweeted
# (retweet_count is among the kept columns, so it is still available to sort)
tweets %>%
  select(user_key, created_str, text, retweet_count) %>%
  arrange(desc(retweet_count))

## # A tibble: 203,451 x 4
##    user_key        created_str         text                  retweet_count
##    <chr>           <dttm>              <chr>                         <int>
##  1 trayneshacole   2016-11-17 20:27:21 go all the way off h…         20494
##  2 gloed_up        2016-03-12 19:50:32 "#AnthonyCage was bl…         18209
##  3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly be…         13636
##  4 ten_gop         2016-10-11 02:11:02 "OMG, this new Anti-…         12078
##  5 ten_gop         2016-10-18 18:44:14 "RT the hell out of …         12042
##  6 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the on…         11616
##  7 jenn_abrams     2016-09-18 16:42:48 "Some guy right in H…         11363
##  8 crystal1johnson 2016-10-09 01:27:00 3 Black children car…         10823
##  9 ten_gop         2016-10-20 05:40:07 "BREAKING🚨 \n\nHilla…         10467
## 10 bleepthepolice  2016-09-21 15:37:27 "Wow. Hadn't thought…         10310
## # ... with 203,441 more rows

# keep only the columns of interest, then rank from most to least favorited
# (favorite_count is among the kept columns, so it is still available to sort)
tweets %>%
  select(user_key, created_str, text, favorite_count) %>%
  arrange(desc(favorite_count))

## # A tibble: 203,451 x 4
##    user_key        created_str         text                 favorite_count
##    <chr>           <dttm>              <chr>                         <int>
##  1 trayneshacole   2016-11-17 20:27:21 go all the way off …          26655
##  2 jenn_abrams     2016-09-18 16:42:48 "Some guy right in …          16068
##  3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly b…          12815
##  4 ten_gop         2016-10-11 02:11:02 "OMG, this new Anti…          10867
##  5 ten_gop         2016-10-20 05:40:07 "BREAKING🚨 \n\nHill…          10437
##  6 crystal1johnson 2016-10-09 01:27:00 3 Black children ca…          10340
##  7 trayneshacole   2016-11-10 22:50:19 "White boyfriend sh…           9265
##  8 pamela_moore13  2016-06-16 23:35:47 "I would rather tak…           9242
##  9 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the o…           8969
## 10 bleepthepolice  2016-09-21 15:37:27 "Wow. Hadn't though…           8495
## # ... with 203,441 more rows

# ggplot
# tally tweets per calendar day: as.Date() reduces each timestamp to its date,
# and n() counts the rows that fall on that date
# NOTE(review): `sum` shares its name with base::sum(); the name is kept
# because later plotting code refers to this object by it
sum <- tweets %>%
  mutate(Date = as.Date(created_str)) %>%
  group_by(Date) %>%
  summarise(Count = n())

# create ggplot
# draw the daily tallies as a line, with the date on x and the count on y
ggplot(sum, aes(Date, Count)) +
  geom_line()

## Warning: Removed 1 rows containing missing values (geom_path).

# the same per-day tally, printed instead of stored so the rows can be inspected
tweets %>%
  mutate(Date = as.Date(created_str)) %>%
  group_by(Date) %>%
  summarise(Count = n())

## # A tibble: 931 x 2
##    Date       Count
##    <date>     <int>
##  1 2014-07-14     1
##  2 2014-07-17     1
##  3 2014-07-20     2
##  4 2014-07-22     3
##  5 2014-07-23     1
##  6 2014-07-25     1
##  7 2014-07-26     1
##  8 2014-07-28     1
##  9 2014-07-29     1
## 10 2014-08-04     1
## # ... with 921 more rows

# daily series again, now with a point marker per day on top of the line,
# plus a title, axis labels and a source caption
ggplot(sum, aes(Date, Count)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Daily Tweet Counts by Russian Trolls",
    x = "Time (Day)",
    y = "Number of Tweets",
    caption = "Source: NBC News"
  )

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

# saving as an object
# the plot is stored in `m` rather than drawn right away
m <- ggplot(sum, aes(Date, Count)) +
  geom_line() +
  geom_point()

# you can manually call the object
# typing its name at top level prints (draws) the stored plot
m

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

# you can extend a saved plot object by adding further components to it,
# here a title, axis labels and a caption
m +
  labs(
    title = "Daily Tweet Counts by Russian Trolls",
    caption = "Source: NBC News",
    x = "Time (Day)",
    y = "Number of Tweets"
  )

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

# use stringr for text filter -- part of the tidyverse 
library(stringr)

# regular expression: `|` means "either", so a tweet matches when its
# lower-cased text contains keithlamontscott or charlotte
term <- "keithlamontscott|charlotte"

# Refer to the column as bare `text`, not `tweets$text`: inside filter() column
# names are looked up in the data arriving through the pipe. `tweets$text`
# reaches back to the global data frame instead, which gives wrong results or
# an error as soon as an earlier pipeline step changes the rows.
q <- tweets %>%
  filter(str_detect(tolower(text), term))

# with full pipeline
term <- "obama"

# filter -> per-day tally -> line chart in one chain; the finished ggplot
# object is stored in `q`.
# The filter uses the bare column `text` rather than `tweets$text`, so it
# always tests the rows flowing through the pipe, not the global data frame.
# Once the data is piped into ggplot(), plot components are added with `+`,
# not `%>%`.
q <- tweets %>%
  filter(str_detect(tolower(text), term)) %>%
  group_by(Date = as.Date(created_str)) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Date, y = Count)) +
  geom_line() +
  labs(
    title = paste0("Tweets filtered by `", term, "`"),
    x = "Day",
    y = "Number of Tweets"
  )

# print the stored plot
q

## first html widget: plotly

# install.packages("plotly")
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# hand the ggplot stored in `q` to plotly's ggplotly() converter
q %>%
  ggplotly()

01-russian-trolls.R

whitesox

Wed Jun 6 14:04:45 2018