First, we need to install the tidyverse. Recall that you only need to install a package once per computer.
After installing, you then need to call the library to “activate” it.
# install packages
# install.packages("tidyverse")
# call (i.e., activate) tidyverse
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
Next, let’s load the csv file from the url.
# Location of the CSV of tweets published by NBC News
url <- "http://nodeassets.nbcnews.com/russian-twitter-trolls/tweets.csv"
# Download and parse the file; column types are guessed by readr
tweets <- read_csv(file = url)
## Parsed with column specification:
## cols(
## user_id = col_double(),
## user_key = col_character(),
## created_at = col_double(),
## created_str = col_datetime(format = ""),
## retweet_count = col_integer(),
## retweeted = col_character(),
## favorite_count = col_integer(),
## text = col_character(),
## tweet_id = col_double(),
## source = col_character(),
## hashtags = col_character(),
## expanded_urls = col_character(),
## posted = col_character(),
## mentions = col_character(),
## retweeted_status_id = col_double(),
## in_reply_to_status_id = col_double()
## )
With the dataframe, let’s use the count() function to count the number of tweets per user (user_key).
# Tally how many rows (tweets) each user_key has; the result is printed
dplyr::count(tweets, user_key)
## # A tibble: 453 x 2
## user_key n
## <chr> <int>
## 1 4ever1937 6
## 2 4mysquad 43
## 3 6druz 5
## 4 aantiracist 37
## 5 abigailssilk 17
## 6 acejinev 65
## 7 adamchapmanjr 38
## 8 adrgreerr 13
## 9 adrienne_gg 40
## 10 agnesgrhm 1
## # ... with 443 more rows
# 1) Plain function call, result stored with the assignment operator `<-`
c <- count(tweets, user_key)

# 2) Same tally, written with the pipe: `tweets` becomes count()'s first argument
c2 <- tweets %>%
  count(user_key)

# 3) Same tally spelled out: group the rows by user, then count rows per group
c3 <- tweets %>%
  group_by(user_key) %>%
  summarise(Count = n())
Question: What is the difference between these three dataframes?
Let’s use other dplyr functions to examine the top retweets…
# Keep the columns of interest, then list the most retweeted tweets first
tweets %>%
  select(user_key, created_str, text, retweet_count) %>%
  arrange(desc(retweet_count))
## # A tibble: 203,451 x 4
## user_key created_str text retweet_count
## <chr> <dttm> <chr> <int>
## 1 trayneshacole 2016-11-17 20:27:21 go all the way off h… 20494
## 2 gloed_up 2016-03-12 19:50:32 "#AnthonyCage was bl… 18209
## 3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly be… 13636
## 4 ten_gop 2016-10-11 02:11:02 "OMG, this new Anti-… 12078
## 5 ten_gop 2016-10-18 18:44:14 "RT the hell out of … 12042
## 6 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the on… 11616
## 7 jenn_abrams 2016-09-18 16:42:48 "Some guy right in H… 11363
## 8 crystal1johnson 2016-10-09 01:27:00 3 Black children car… 10823
## 9 ten_gop 2016-10-20 05:40:07 "BREAKING🚨 \n\nHilla… 10467
## 10 bleepthepolice 2016-09-21 15:37:27 "Wow. Hadn't thought… 10310
## # ... with 203,441 more rows
… and most favorited (liked) tweets.
# Keep the columns of interest, then list the most favorited tweets first
tweets %>%
  select(user_key, created_str, text, favorite_count) %>%
  arrange(desc(favorite_count))
## # A tibble: 203,451 x 4
## user_key created_str text favorite_count
## <chr> <dttm> <chr> <int>
## 1 trayneshacole 2016-11-17 20:27:21 go all the way off … 26655
## 2 jenn_abrams 2016-09-18 16:42:48 "Some guy right in … 16068
## 3 crystal1johnson 2016-10-10 22:55:10 "When It’s slowly b… 12815
## 4 ten_gop 2016-10-11 02:11:02 "OMG, this new Anti… 10867
## 5 ten_gop 2016-10-20 05:40:07 "BREAKING🚨 \n\nHill… 10437
## 6 crystal1johnson 2016-10-09 01:27:00 3 Black children ca… 10340
## 7 trayneshacole 2016-11-10 22:50:19 "White boyfriend sh… 9265
## 8 pamela_moore13 2016-06-16 23:35:47 "I would rather tak… 9242
## 9 crystal1johnson 2016-06-04 01:06:58 Muhammad Ali, the o… 8969
## 10 bleepthepolice 2016-09-21 15:37:27 "Wow. Hadn't though… 8495
## # ... with 203,441 more rows
We can then count the number of tweets per day with dplyr and plot the result with ggplot.
# Daily totals: derive a calendar date from each timestamp, then count rows
# per date. NOTE: the variable name `sum` is the same as base R's sum()
# function; it is kept here because later chunks refer to it.
sum <- tweets %>%
  mutate(Date = as.Date(created_str)) %>%
  group_by(Date) %>%
  summarise(Count = n())

# Line chart of the daily totals
ggplot(sum, aes(x = Date, y = Count)) +
  geom_line()
# Display the number of tweets per calendar day (printed, not stored)
summarise(
  group_by(tweets, Date = as.Date(created_str)),
  Count = n()
)
## # A tibble: 931 x 2
## Date Count
## <date> <int>
## 1 2014-07-14 1
## 2 2014-07-17 1
## 3 2014-07-20 2
## 4 2014-07-22 3
## 5 2014-07-23 1
## 6 2014-07-25 1
## 7 2014-07-26 1
## 8 2014-07-28 1
## 9 2014-07-29 1
## 10 2014-08-04 1
## # ... with 921 more rows
# Daily totals as a line with a point marker on each day, plus a title,
# axis labels and a source caption
ggplot(sum, aes(x = Date, y = Count)) +
  geom_line() +
  geom_point() +
  ggtitle("Daily Tweet Counts by Russian Trolls") +
  xlab("Time (Day)") +
  ylab("Number of Tweets") +
  labs(caption = "Source: NBC News")
We can also save the plot to an object (m).
# Store the plot in `m` instead of drawing it immediately
m <- ggplot(sum, aes(x = Date, y = Count)) +
  geom_line() +
  geom_point()

# Typing the object's name on its own draws the stored plot
m
With that object, you can add new layers to the plot.
# Extend the stored plot with a title, caption and axis labels;
# `m` itself is not modified because the result is not assigned back
m +
  labs(
    title = "Daily Tweet Counts by Russian Trolls",
    caption = "Source: NBC News"
  ) +
  xlab("Time (Day)") +
  ylab("Number of Tweets")
Let’s use the stringr package to filter tweets that mention either #keithlamontscott or Charlotte (| is the “OR” operator).
# Keep only the tweets whose text matches a search pattern, ignoring case.
# stringr belongs to the tidyverse family; it is attached explicitly because
# not every tidyverse version loads it automatically.
library(stringr)

# Regular expression: `|` means "OR", so either alternative may match
term <- "#keithlamontscott|charlotte"

q <- tweets %>%
  # Use the bare `text` column (not `tweets$text`) so the test is evaluated on
  # the rows flowing through the pipe. regex(ignore_case = TRUE) handles case
  # without lowercasing the pattern, which could corrupt escapes such as \\S.
  filter(str_detect(text, regex(term, ignore_case = TRUE)))
Let’s now link these steps together into a single pipeline: filter the tweets, count them by day, and plot the result.
# Full pipeline: filter tweets by a search term, count matches per day, and
# plot the daily counts. The finished ggplot object is stored in `q`.
term <- "obama"

q <- tweets %>%
  # Bare `text` column (not `tweets$text`) keeps the filter tied to the rows in
  # the pipe; ignore_case = TRUE makes the match work whatever the case of `term`.
  filter(str_detect(text, regex(term, ignore_case = TRUE))) %>%
  group_by(Date = as.Date(created_str)) %>%
  summarise(Count = n()) %>%
  # From here on the pieces are joined with `+` because they are ggplot layers
  ggplot(aes(x = Date, y = Count)) +
  geom_line() +
  labs(
    title = paste0("Tweets filtered by `", term, "`"),
    x = "Day",
    y = "Number of Tweets"
  )

# Draw the plot
q
Last, let’s use the plotly package to make this ggplot interactive.
# install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Convert the stored ggplot object `q` into an interactive plotly widget
plotly::ggplotly(p = q)