# install packages
# install.packages("tidyverse")
# call (i.e., activate) tidyverse
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Fetch the NBC News troll-tweet CSV from its URL and parse it into a tibble.
url <- "http://nodeassets.nbcnews.com/russian-twitter-trolls/tweets.csv"
tweets <- read_csv(file = url)
## Parsed with column specification:
## cols(
## user_id = col_double(),
## user_key = col_character(),
## created_at = col_double(),
## created_str = col_datetime(format = ""),
## retweet_count = col_integer(),
## retweeted = col_character(),
## favorite_count = col_integer(),
## text = col_character(),
## tweet_id = col_double(),
## source = col_character(),
## hashtags = col_character(),
## expanded_urls = col_character(),
## posted = col_character(),
## mentions = col_character(),
## retweeted_status_id = col_double(),
## in_reply_to_status_id = col_double()
## )
# functions in tidyverse
# tidyverse verbs take the data frame as their first argument; count() tallies
# how many rows of `tweets` there are for each distinct value of user_key
count(tweets, user_key)
## # A tibble: 453 x 2
## user_key n
## <chr> <int>
## 1 4ever1937 6
## 2 4mysquad 43
## 3 6druz 5
## 4 aantiracist 37
## 5 abigailssilk 17
## 6 acejinev 65
## 7 adamchapmanjr 38
## 8 adrgreerr 13
## 9 adrienne_gg 40
## 10 agnesgrhm 1
## # ... with 443 more rows
# The same per-user tally, written three ways.

# 1. assignment operator: store the result of a plain function call
c <- count(tweets, user_key)

# 2. piping: `tweets` is passed along as the first argument of count()
c2 <- tweets %>%
  count(user_key)

# 3. group_by and summarise: group the rows first, then count rows per group
c3 <- tweets %>%
  group_by(user_key) %>%
  summarise(Count = n())
# what is the difference between these three dataframes?
### What were the top tweets by retweet and favorite (likes) counts?
# Keep only the columns worth displaying, then rank by retweets, highest first.
tweets %>%
  select(user_key, created_str, text, retweet_count) %>%
  arrange(desc(retweet_count))
## # A tibble: 203,451 x 4
## user_key created_str text retweet_count
## <chr> <dttm> <chr> <int>
## 1 trayneshacole 2016-11-17 20:27:21 go all the way off h… 20494
## 2 gloed_up 2016-03-12 19:50:32 "#AnthonyCage was bl… 18209
## 3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly be… 13636
## 4 ten_gop 2016-10-11 02:11:02 "OMG, this new Anti-… 12078
## 5 ten_gop 2016-10-18 18:44:14 "RT the hell out of … 12042
## 6 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the on… 11616
## 7 jenn_abrams 2016-09-18 16:42:48 "Some guy right in H… 11363
## 8 crystal1johnson 2016-10-09 01:27:00 3 Black children car… 10823
## 9 ten_gop 2016-10-20 05:40:07 "BREAKING🚨 \n\nHilla… 10467
## 10 bleepthepolice 2016-09-21 15:37:27 "Wow. Hadn't thought… 10310
## # ... with 203,441 more rows
# Keep only the columns worth displaying, then rank by favorites, highest first.
tweets %>%
  select(user_key, created_str, text, favorite_count) %>%
  arrange(desc(favorite_count))
## # A tibble: 203,451 x 4
## user_key created_str text favorite_count
## <chr> <dttm> <chr> <int>
## 1 trayneshacole 2016-11-17 20:27:21 go all the way off … 26655
## 2 jenn_abrams 2016-09-18 16:42:48 "Some guy right in … 16068
## 3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly b… 12815
## 4 ten_gop 2016-10-11 02:11:02 "OMG, this new Anti… 10867
## 5 ten_gop 2016-10-20 05:40:07 "BREAKING🚨 \n\nHill… 10437
## 6 crystal1johnson 2016-10-09 01:27:00 3 Black children ca… 10340
## 7 trayneshacole 2016-11-10 22:50:19 "White boyfriend sh… 9265
## 8 pamela_moore13 2016-06-16 23:35:47 "I would rather tak… 9242
## 9 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the o… 8969
## 10 bleepthepolice 2016-09-21 15:37:27 "Wow. Hadn't though… 8495
## # ... with 203,441 more rows
# ggplot
# Daily tweet volume: truncate each timestamp to its calendar date, then count
# the tweets that fall on each date.
sum <- tweets %>%
  mutate(Date = as.Date(created_str)) %>%
  group_by(Date) %>%
  summarise(Count = n())
# create ggplot: date on the x axis, tweet count on the y axis, drawn as a line
sum %>%
  ggplot(aes(x = Date, y = Count)) +
  geom_line()
## Warning: Removed 1 rows containing missing values (geom_path).

# Print the per-day counts that feed the plot.
tweets %>%
  mutate(Date = as.Date(created_str)) %>%
  group_by(Date) %>%
  summarise(Count = n())
## # A tibble: 931 x 2
## Date Count
## <date> <int>
## 1 2014-07-14 1
## 2 2014-07-17 1
## 3 2014-07-20 2
## 4 2014-07-22 3
## 5 2014-07-23 1
## 6 2014-07-25 1
## 7 2014-07-26 1
## 8 2014-07-28 1
## 9 2014-07-29 1
## 10 2014-08-04 1
## # ... with 921 more rows
# Same daily series with a point per day plus a title, axis labels and caption.
sum %>%
  ggplot(aes(x = Date, y = Count)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Daily Tweet Counts by Russian Trolls",
    x = "Time (Day)",
    y = "Number of Tweets",
    caption = "Source: NBC News"
  )
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

# saving as an object: nothing is drawn while the plot is only being assigned
m <- ggplot(sum, aes(Date, Count)) +
  geom_line() +
  geom_point()
# typing the object's name on its own renders it
m
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

# you can extend the saved plot object by adding more layers (here: labels)
# Add the title, axis labels and caption on top of the stored plot `m`.
m +
  labs(
    title = "Daily Tweet Counts by Russian Trolls",
    x = "Time (Day)",
    y = "Number of Tweets",
    caption = "Source: NBC News"
  )
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

# use stringr for text filter -- part of the tidyverse
library(stringr)
# `term` is a regular expression; `|` means "or", so a tweet matches when its
# text contains either keithlamontscott or charlotte.
term <- "keithlamontscott|charlotte"
# The text is lower-cased before matching so the search ignores case.
# The column is referenced as bare `text`, not `tweets$text`: filter() then
# evaluates it on the rows arriving through the pipe, so the step stays correct
# even if other steps are later inserted before it.
q <- tweets %>%
  filter(str_detect(tolower(text), term))
# with full pipeline: filter -> count per day -> plot, all in one chain
term <- "obama"
q <- tweets %>%
  # Case-insensitive match on the piped rows. Use the bare column `text`
  # rather than `tweets$text` so the filter always sees the same rows as the
  # rest of the pipeline.
  filter(str_detect(tolower(text), term)) %>%
  # one row per calendar day with the number of matching tweets
  group_by(Date = as.Date(created_str)) %>%
  summarise(Count = n()) %>%
  # from here on layers are added with `+`, not `%>%`
  ggplot(aes(x = Date, y = Count)) +
  geom_line() +
  labs(
    title = paste0("Tweets filtered by `", term, "`"),
    x = "Day",
    y = "Number of Tweets"
  )
# render the stored plot
q

## first html widget: plotly
# install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# convert the ggplot object stored in `q` into a plotly html widget
ggplotly(q)