First, we need to get the tidyverse package. Recall, you only need to install a package once (but you need to do it on each computer you use).
After installing, you then need to call the library to “activate” it.
# install packages
# install.packages("tidyverse")
# call (i.e., activate) tidyverse
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
Next, let’s load the CSV file from the URL.
# location of the tweets CSV file
url <- "http://nodeassets.nbcnews.com/russian-twitter-trolls/tweets.csv"
# read_csv() (from readr, loaded with the tidyverse) reads the file at that
# address and stores the result in tweets
# NOTE(review): the column specification printed below shows user_id and
# tweet_id parsed as doubles; if the IDs are very large this may lose
# precision -- confirm, and consider reading them as character via col_types
tweets <- read_csv(url)
## Parsed with column specification:
## cols(
## user_id = col_double(),
## user_key = col_character(),
## created_at = col_double(),
## created_str = col_datetime(format = ""),
## retweet_count = col_integer(),
## retweeted = col_character(),
## favorite_count = col_integer(),
## text = col_character(),
## tweet_id = col_double(),
## source = col_character(),
## hashtags = col_character(),
## expanded_urls = col_character(),
## posted = col_character(),
## mentions = col_character(),
## retweeted_status_id = col_double(),
## in_reply_to_status_id = col_double()
## )
With the dataframe, let’s use the count() function to count the number of tweets per user (user_key).
# count function: call count() directly, passing the data frame as the first
# argument and the column to tally (user_key) as the second
count(tweets, user_key)
## # A tibble: 453 x 2
## user_key n
## <chr> <int>
## 1 4ever1937 6
## 2 4mysquad 43
## 3 6druz 5
## 4 aantiracist 37
## 5 abigailssilk 17
## 6 acejinev 65
## 7 adamchapmanjr 38
## 8 adrgreerr 13
## 9 adrienne_gg 40
## 10 agnesgrhm 1
## # ... with 443 more rows
# assignment operator: store the result of count() in an object instead of
# printing it
# NOTE(review): `c` is also the name of base R's c() function -- consider a
# more descriptive name
c <- count(tweets, user_key)
# piping: %>% passes tweets as the first argument of count()
c2 <- tweets %>%
count(user_key)
# group_by and summarise: group the rows by user_key, then count the rows in
# each group with n() and store that count in a column named Count
c3 <- tweets %>%
group_by(user_key) %>%
summarise(Count=n())
Question: What is the difference between these three dataframes?
Let’s use other dplyr functions to examine the top retweets…
# Keep only the columns of interest, then sort so that the most-retweeted
# tweets come first
tweets %>%
  select(user_key, created_str, text, retweet_count) %>%
  arrange(desc(retweet_count))
## # A tibble: 203,451 x 4
## user_key created_str text retweet_count
## <chr> <dttm> <chr> <int>
## 1 trayneshacole 2016-11-17 20:27:21 go all the way off h… 20494
## 2 gloed_up 2016-03-12 19:50:32 "#AnthonyCage was bl… 18209
## 3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly be… 13636
## 4 ten_gop 2016-10-11 02:11:02 "OMG, this new Anti-… 12078
## 5 ten_gop 2016-10-18 18:44:14 "RT the hell out of … 12042
## 6 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the on… 11616
## 7 jenn_abrams 2016-09-18 16:42:48 "Some guy right in H… 11363
## 8 crystal1johnson 2016-10-09 01:27:00 3 Black children car… 10823
## 9 ten_gop 2016-10-20 05:40:07 "BREAKING🚨 \n\nHilla… 10467
## 10 bleepthepolice 2016-09-21 15:37:27 "Wow. Hadn't thought… 10310
## # ... with 203,441 more rows
… and most favorited (liked) tweets.
# Keep only the columns of interest, then sort so that the most-favorited
# tweets come first
tweets %>%
  select(user_key, created_str, text, favorite_count) %>%
  arrange(desc(favorite_count))
## # A tibble: 203,451 x 4
## user_key created_str text favorite_count
## <chr> <dttm> <chr> <int>
## 1 trayneshacole 2016-11-17 20:27:21 go all the way off … 26655
## 2 jenn_abrams 2016-09-18 16:42:48 "Some guy right in … 16068
## 3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly b… 12815
## 4 ten_gop 2016-10-11 02:11:02 "OMG, this new Anti… 10867
## 5 ten_gop 2016-10-20 05:40:07 "BREAKING🚨 \n\nHill… 10437
## 6 crystal1johnson 2016-10-09 01:27:00 3 Black children ca… 10340
## 7 trayneshacole 2016-11-10 22:50:19 "White boyfriend sh… 9265
## 8 pamela_moore13 2016-06-16 23:35:47 "I would rather tak… 9242
## 9 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the o… 8969
## 10 bleepthepolice 2016-09-21 15:37:27 "Wow. Hadn't though… 8495
## # ... with 203,441 more rows
We can then count the number of tweets per day and plot the counts with ggplot.
# Daily tweet counts: derive a Date column from the timestamp, then count the
# rows that fall on each date
# NOTE(review): `sum` is also the name of base R's sum() function -- consider
# a more descriptive name (later chunks refer to this object as `sum`)
sum <- tweets %>%
  mutate(Date = as.Date(created_str)) %>%
  group_by(Date) %>%
  summarise(Count = n())
# Line chart of the daily counts
sum %>%
  ggplot(aes(x = Date, y = Count)) +
  geom_line()
# Print the per-day tweet counts as a table
tweets %>%
  mutate(Date = as.Date(created_str)) %>%
  group_by(Date) %>%
  summarise(Count = n())
## # A tibble: 931 x 2
## Date Count
## <date> <int>
## 1 2014-07-14 1
## 2 2014-07-17 1
## 3 2014-07-20 2
## 4 2014-07-22 3
## 5 2014-07-23 1
## 6 2014-07-25 1
## 7 2014-07-26 1
## 8 2014-07-28 1
## 9 2014-07-29 1
## 10 2014-08-04 1
## # ... with 921 more rows
# Same daily series, now with a marker on every observation and descriptive
# title, axis labels, and caption
sum %>%
  ggplot(aes(x = Date, y = Count)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Daily Tweet Counts by Russian Trolls",
    x = "Time (Day)",
    y = "Number of Tweets",
    caption = "Source: NBC News"
  )
We can also save the plot to an object (m).
# saving as an object: assign the plot to m instead of drawing it right away
m <- ggplot(data = sum, aes(x = Date, y = Count)) +
geom_line() +
geom_point()
# you can manually call the object: typing its name on its own line draws it
m
With that object, you can add new layers to the plot.
# you can add layers: `+` builds on the saved plot m; here labs() supplies the
# title, axis labels, and caption (m itself is not reassigned)
m + labs(title = "Daily Tweet Counts by Russian Trolls",
x = "Time (Day)",
y = "Number of Tweets",
caption = "Source: NBC News")
Let’s use the stringr package to filter tweets that mention either #keithlamontscott or Charlotte (| is the “OR” operator).
# use stringr for text filter -- part of the tidyverse
library(stringr)
# regular expression: `|` means OR, so a tweet matches if it contains either
# search term
term <- "#keithlamontscott|charlotte"
# Refer to the column as `text` rather than `tweets$text`, so that filter()
# tests the rows of the data frame it was handed by the pipe.
# regex(ignore_case = TRUE) makes the match case-insensitive without
# lower-casing the pattern itself (which would corrupt escapes such as \\S).
q <- tweets %>%
  filter(str_detect(text, regex(term, ignore_case = TRUE)))
Let’s now link these steps together into a single pipeline.
# with full pipeline: filter by search term -> count tweets per day -> plot
term <- "obama"
q <- tweets %>%
  # Use the `text` column of the piped data (not `tweets$text`) and match
  # case-insensitively, so a term containing capital letters still matches
  filter(str_detect(text, regex(term, ignore_case = TRUE))) %>%
  group_by(Date = as.Date(created_str)) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Date, y = Count)) +
  geom_line() +
  labs(title = paste0("Tweets filtered by `", term, "`"),
       x = "Day",
       y = "Number of Tweets")
# draw the plot stored in q
q
Last, let’s use the plotly package to make this ggplot interactive.
# install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# convert the ggplot object q into an interactive plotly figure
ggplotly(q)