We will introduce two new packages later in this lesson: igraph and ggraph.
# load twitter library - the rtweet library is recommended now over twitteR
#install.packages("rtweet")
#install.packages("tidytext")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rtweet)
##
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
##
## flatten
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
# text mining library
library(tidytext)
To use the Twitter API you first need to authenticate; the steps are documented in the rtweet vignette: https://cran.r-project.org/web/packages/rtweet/vignettes/auth.html
api_key <- "djPufbwIyRqftHHHWfCAIvf0n"
api_secret_key <- "HgA2x4ZXRX0rM7sknaCYHQWnJSVg6UZytyzeowQlgfzh2vE4Rv"
access_token <- "347824918-JGHuXEqxmJB0RQEj75d4cU7hoDF3oy2MMMriFFB1"
access_token_secret <- "KPNNECnuAWVrZhDYslNYLr667vud1x0VubTCbC3WSQn8j"
# authenticate with the keys and tokens generated on the Twitter developer portal
token <- create_token(
app = "DATA110First",
consumer_key = api_key,
consumer_secret = api_secret_key,
access_token = access_token,
access_secret = access_token_secret)
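The rest of this section assumes a climate_tweets data frame with a stripped_text column, created earlier in the lesson. If you are picking up from here, a minimal sketch of that step might look like the following (the search query, tweet count, and URL pattern are placeholders, not necessarily what was used earlier):
# collect tweets with rtweet (query and n are placeholders for illustration)
climate_tweets <- search_tweets(q = "#climatechange", n = 10000,
                                lang = "en", include_rts = FALSE)
# strip URLs out of the tweet text before tokenizing
climate_tweets <- climate_tweets %>%
  mutate(stripped_text = str_remove_all(text, "https?://\\S+"))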
Next, we look at words that occur together in tweets. Setting token = "ngrams" tells unnest_tokens() to return sequences of words rather than single words, and n = 2 makes those sequences pairs of adjacent words (bigrams).
#library(devtools)
#install_github("dgrtwo/widyr")
library(widyr)
# unnest_tokens() removes punctuation, converts to lowercase, and splits the text into word pairs
climate_tweets_paired_words <- climate_tweets %>%
dplyr::select(stripped_text) %>%
unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2)
climate_tweets_paired_words %>%
count(paired_words, sort = TRUE)
## # A tibble: 25,941 x 2
## paired_words n
## <chr> <int>
## 1 greenhouse gas 706
## 2 climate change 533
## 3 gas emissions 465
## 4 greenhouse gases 314
## 5 of the 257
## 6 in the 172
## 7 of greenhouse 150
## 8 the climate 113
## 9 the greenhouse 94
## 10 to the 93
## # … with 25,931 more rows
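To see what the bigram tokenizer does on its own, here is a small self-contained example you can run without any Twitter data (the two sentences are made up for illustration):
# two toy "tweets"
toy_tweets <- tibble(stripped_text = c("greenhouse gas emissions keep rising",
                                       "climate change drives extreme weather"))
# split each sentence into overlapping word pairs
toy_tweets %>%
  unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2)
Each five-word sentence yields four overlapping pairs, all lowercased with punctuation removed.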
Again, we eliminate the stop words, this time checking both words in each pair.
library(tidyr)
climate_tweets_separated_words <- climate_tweets_paired_words %>%
separate(paired_words, c("word1", "word2"), sep = " ")
climate_tweets_filtered <- climate_tweets_separated_words %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
climate_words_counts <- climate_tweets_filtered %>%
count(word1, word2, sort = TRUE)
head(climate_words_counts)
## # A tibble: 6 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 greenhouse gas 706
## 2 climate change 533
## 3 gas emissions 465
## 4 greenhouse gases 314
## 5 greenhouse effect 88
## 6 reduce greenhouse 73
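As an optional aside, tidyr's unite() reverses the separate() step if you want the filtered pairs back as single strings (this object is not used later in the lesson):
# glue word1 and word2 back into a single bigram column and count
climate_tweets_filtered %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE) %>%
  head()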
# plotting packages
# install.packages("igraph")
# install.packages("ggraph")
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
# plot climate word network
climate_words_counts %>%
filter(n >= 24) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
geom_node_point(color = "blue4", size = 3) +
geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
labs(title = "Word Network: Tweets using the hashtag - Climate and Greenhouse",
subtitle = "Text mining twitter data ",
x = "", y = "")