We will introduce two new packages later in this lesson: igraph and ggraph.
# load twitter library - the rtweet library is recommended now over twitteR
#install.packages("rtweet")
#install.packages("tidytext")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rtweet)
##
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
##
## flatten
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
# text mining library
library(tidytext)
To use the Twitter API you first need to authenticate; the steps are documented in the rtweet vignette: https://cran.r-project.org/web/packages/rtweet/vignettes/auth.html
api_key <- "djPufbwIyRqftHHHWfCAIvf0n"
api_secret_key <- "HgA2x4ZXRX0rM7sknaCYHQWnJSVg6UZytyzeowQlgfzh2vE4Rv"
access_token <- "347824918-JGHuXEqxmJB0RQEj75d4cU7hoDF3oy2MMMriFFB1"
access_token_secret <- "KPNNECnuAWVrZhDYslNYLr667vud1x0VubTCbC3WSQn8j"
# authenticate with the keys and tokens generated on the Twitter developer portal
token <- create_token(
app = "DATA110First",
consumer_key = api_key,
consumer_secret = api_secret_key,
access_token = access_token,
access_secret = access_token_secret)
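The rest of this section assumes a climate_tweets data frame with a stripped_text column, created earlier in the lesson. If you are picking up from here, a minimal sketch of that step might look like the following (the search query, tweet count, and URL pattern are placeholders, not necessarily what was used earlier):
# collect tweets with rtweet (query and n are placeholders for illustration)
climate_tweets <- search_tweets(q = "#climatechange", n = 10000,
                                lang = "en", include_rts = FALSE)
# strip URLs out of the tweet text before tokenizing
climate_tweets <- climate_tweets %>%
  mutate(stripped_text = str_remove_all(text, "https?://\\S+"))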
Next, we look at words that occur together in tweets. Setting token = "ngrams" tells unnest_tokens() to return sequences of words rather than single words, and n = 2 makes those sequences pairs of adjacent words (bigrams).
#library(devtools)
#install_github("dgrtwo/widyr")
library(widyr)
# unnest_tokens() removes punctuation, converts to lowercase, and splits the text into word pairs
climate_tweets_paired_words <- climate_tweets %>%
dplyr::select(stripped_text) %>%
unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2)
climate_tweets_paired_words %>%
count(paired_words, sort = TRUE)
## # A tibble: 25,941 x 2
## paired_words n
## <chr> <int>
## 1 greenhouse gas 706
## 2 climate change 533
## 3 gas emissions 465
## 4 greenhouse gases 314
## 5 of the 257
## 6 in the 172
## 7 of greenhouse 150
## 8 the climate 113
## 9 the greenhouse 94
## 10 to the 93
## # … with 25,931 more rows
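To see what the bigram tokenizer does on its own, here is a small self-contained example you can run without any Twitter data (the two sentences are made up for illustration):
# two toy "tweets"
toy_tweets <- tibble(stripped_text = c("greenhouse gas emissions keep rising",
                                       "climate change drives extreme weather"))
# split each sentence into overlapping word pairs
toy_tweets %>%
  unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2)
Each five-word sentence yields four overlapping pairs, all lowercased with punctuation removed.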
Again, we eliminate the stop words, this time checking both words in each pair.
library(tidyr)
climate_tweets_separated_words <- climate_tweets_paired_words %>%
separate(paired_words, c("word1", "word2"), sep = " ")
climate_tweets_filtered <- climate_tweets_separated_words %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
climate_words_counts <- climate_tweets_filtered %>%
count(word1, word2, sort = TRUE)
head(climate_words_counts)
## # A tibble: 6 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 greenhouse gas 706
## 2 climate change 533
## 3 gas emissions 465
## 4 greenhouse gases 314
## 5 greenhouse effect 88
## 6 reduce greenhouse 73
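As an optional aside, tidyr's unite() reverses the separate() step if you want the filtered pairs back as single strings (this object is not used later in the lesson):
# glue word1 and word2 back into a single bigram column and count
climate_tweets_filtered %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE) %>%
  head()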
# plotting packages
# install.packages("igraph")
# install.packages("ggraph")
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
# plot climate word network
climate_words_counts %>%
filter(n >= 24) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
geom_node_point(color = "blue4", size = 3) +
geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
labs(title = "Word Network: Tweets using the hashtag - Climate and Greenhouse",
subtitle = "Text mining twitter data ",
x = "", y = "")