#STEP 1: Load the rtweet and other needed R packages.
# load twitter library - the rtweet library is recommended now over twitteR
#install.packages('rvest')
#Loading the rvest package
library('rvest')
## Warning: package 'rvest' was built under R version 4.1.3
#install.packages('rtweet')
library(rtweet)
## Warning: package 'rtweet' was built under R version 4.1.3
# plotting and pipes - tidyverse!
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# text mining library
#install.packages('tidytext')
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
# plotting packages
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
#install.packages('ggraph')
library(ggraph)
## Warning: package 'ggraph' was built under R version 4.1.3
#Access Token/Secret Method
# Credentials are read from environment variables (for example defined in
# ~/.Renviron) rather than written into the script, so that no secret is
# stored alongside the code. The variable names below are this script's own
# convention.
# NOTE(review): the credentials that used to be hard-coded here have been
# exposed and should be revoked and regenerated in the Twitter developer portal.
api_key <- Sys.getenv("TWITTER_API_KEY")
api_secret_key <- Sys.getenv("TWITTER_API_SECRET_KEY")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")
# Sys.getenv() returns "" for an unset variable; stop early with a clear
# message instead of sending empty credentials to the API.
twitter_credentials <- c(
  TWITTER_API_KEY = api_key,
  TWITTER_API_SECRET_KEY = api_secret_key,
  TWITTER_ACCESS_TOKEN = access_token,
  TWITTER_ACCESS_TOKEN_SECRET = access_token_secret
)
missing_credentials <- names(twitter_credentials)[!nzchar(twitter_credentials)]
if (length(missing_credentials) > 0) {
  stop(
    "Missing Twitter credentials; set environment variable(s): ",
    paste(missing_credentials, collapse = ", "),
    call. = FALSE
  )
}
## authenticate with the app's access token/secret
token <- create_token(
  app = "sileshiproject1",
  consumer_key = api_key,
  consumer_secret = api_secret_key,
  access_token = access_token,
  access_secret = access_token_secret
)
# show the token that rtweet will use for subsequent requests
get_token()
## <Token>
## <oauth_endpoint>
## request: https://api.twitter.com/oauth/request_token
## authorize: https://api.twitter.com/oauth/authenticate
## access: https://api.twitter.com/oauth/access_token
## <oauth_app> sileshiproject1
## key: <redacted>
## secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
# Search for tweets related to climate: request up to 10000 English-language
# tweets matching "climate change", leaving retweets out.
climate_tweets <- search_tweets(
  q = "climate change",
  n = 10000,
  lang = "en",
  include_rts = FALSE
)
## Warning: Rate limit exceeded - 88
## Warning: Rate limit exceeded
# look at the first few tweet texts to see if there are emojis
head(climate_tweets[["text"]])
## [1] "@shaunnapiranha1 @karen55777911 @motogpbsb Statically you’re be a Putin & Trump supporter .. climate change sceptic, anti vac , 9/11 ‘truther’ & possibly believe the moon landings were faked.. you see conspiracy everywhere… I’d suggest it’s you being ‘played’ by fantasists on social media"
## [2] "@JH4PDX Climate change is a total lie. It was made up along with the use of UFOs and aliens in Iron Mountain, NY as a way to scare the people into accepting a new world communist government. Please study instead of buying government BS!"
## [3] "#ClimateChange the Tool of Control #HumanRights #Propaganda #Censorship #GlobalWarming #GlobalCooling #MiniIceAge #SolarCycles #SunSpots #SolarMinimums #PoleShifts #Taxes #Pseudoscience #Socialism #ClimateHoax #Facisim https://t.co/HvnsfonTT8 https://t.co/46rpsCQF4O"
## [4] "We don't need more IPCC reports to take action on climate change: creating human meaning and connection is critical now. Terrific from @MillieRooney #Science4PublicGood @AustraliareMADE"
## [5] "@TimWilsonMP @GetUp When your climate change denier team do it to your opponent it’s fine with you https://t.co/QXsn24ECpY"
## [6] "Economists care about climate change more than any other election issue. #auspol Not one of 50 top economists listed lower taxes as an important election issue. One said “Neither major party was offering anything substantive “. https://t.co/zh3Xs9ePpf"
#Data Clean-Up
# Remove URLs from the tweet text and store the result in `stripped_text`.
# The pattern matches "http://" or "https://" followed by the run of
# non-whitespace characters that makes up the link, so only the link itself is
# removed. A greedy "http.*" would also delete every word that follows the
# first link in a tweet.
climate_tweets$stripped_text <- gsub("https?://\\S+", "", climate_tweets$text)
# Which words does R treat as distinct? Entries that differ only in case stay
# separate, and a lone punctuation mark counts as an entry of its own.
a_list_of_words <- c("Dog", "dog", "dog", "cat", "cat", ",")
a_list_of_words[!duplicated(a_list_of_words)]
## [1] "Dog" "dog" "cat" ","
# Tokenise the URL-stripped text into one row per word (column `word`), using
# unnest_tokens() with its default settings. Only the `stripped_text` column is
# passed in, so no other tweet column is carried along.
climate_tweets_clean <- unnest_tokens(
  tbl = dplyr::select(climate_tweets, stripped_text),
  output = word,
  input = stripped_text
)
# plot the top 15 words -- notice any issues?
# (stop words have not been removed from `climate_tweets_clean` at this point)
# `wt = n` is passed explicitly so top_n() does not have to guess the ranking
# column.
climate_tweets_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15, wt = n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is the word, y its count
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
#Explore Networks of Words
# bring the `stop_words` data set (shipped with tidytext) into the workspace
data(stop_words)
# peek at its first six rows
head(stop_words, n = 6L)
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
# number of word tokens before stop-word removal
nrow(climate_tweets_clean)
## [1] 223516
## [1] 237945
## [1] 230984 or something similar
# remove stop words from your list of words; the join key is named explicitly
# so the join does not depend on (or report) an automatically chosen column
cleaned_tweet_words <- climate_tweets_clean %>%
  anti_join(stop_words, by = "word")
# there should be fewer words now
nrow(cleaned_tweet_words)
## [1] 115351
## [1] 122857
## [1] 118701 or something similar
library(devtools)
## Loading required package: usethis
# Install widyr from GitHub only when it is not already installed, so that
# running the script does not contact GitHub every time.
if (!requireNamespace("widyr", quietly = TRUE)) {
  install_github("dgrtwo/widyr")
}
library(widyr)
# Tokenise the URL-stripped text into bigrams (two-word n-grams), one row per
# bigram in the column `paired_words`.
climate_tweets_paired_words <- unnest_tokens(
  tbl = dplyr::select(climate_tweets, stripped_text),
  output = paired_words,
  input = stripped_text,
  token = "ngrams",
  n = 2
)
# most frequent bigrams first
count(climate_tweets_paired_words, paired_words, sort = TRUE)
## # A tibble: 109,097 x 2
## paired_words n
## <chr> <int>
## 1 climate change 6912
## 2 of the 672
## 3 change is 638
## 4 in the 618
## 5 change and 596
## 6 of climate 535
## 7 on climate 470
## 8 is a 427
## 9 about climate 378
## 10 the climate 343
## # ... with 109,087 more rows
# Again, eliminate the stop words from the paired words
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
##
## crossing
# Split each bigram into its two component words.
climate_tweets_separated_words <- climate_tweets_paired_words %>%
  separate(paired_words, c("word1", "word2"), sep = " ")
# Keep only complete bigrams in which neither word is a stop word.
# Rows with a missing word are dropped first: `!NA %in% stop_words$word` is
# TRUE, so they would otherwise pass the stop-word filters and end up as a
# literal "NA" node in the word network.
# NOTE(review): the missing bigrams presumably come from tweets too short to
# form a two-word n-gram -- confirm.
climate_tweets_filtered <- climate_tweets_separated_words %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# new bigram counts:
climate_words_counts <- climate_tweets_filtered %>%
  count(word1, word2, sort = TRUE)
head(climate_words_counts)
## # A tibble: 6 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 climate change 6912
## 2 aged care 189
## 3 fossil fuels 181
## 4 milliganreports climate 174
## 5 global warming 139
## 6 change amp 96
# Finally, draw the bigram network (not the prettiest graph, but interesting)
library(igraph)
library(ggraph)
# Word pairs seen at least 24 times become the edges of a graph, which is laid
# out with the "fr" layout; edge transparency and width are both mapped to the
# pair count `n`.
# (plotting graph edges is currently broken)
ggraph(
  graph_from_data_frame(filter(climate_words_counts, n >= 24)),
  layout = "fr"
) +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(
    title = "Word Network: Tweets using the hashtag - Climate Change",
    subtitle = "Text mining twitter data ",
    x = "",
    y = ""
  )
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"