Ethics of Mining and Web Scraping

#STEP 1: Load the rtweet and other needed R packages.

# load twitter library - the rtweet library is recommended now over twitteR
#install.packages('rvest')
#Loading the rvest package
library('rvest')

## Warning: package 'rvest' was built under R version 4.1.3

#install.packages('rtweet')
library(rtweet)

## Warning: package 'rtweet' was built under R version 4.1.3

# plotting and pipes - tidyverse!

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.1.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# text mining library
#install.packages('tidytext')
library(tidytext)

## Warning: package 'tidytext' was built under R version 4.1.3

# plotting packages
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

#install.packages('ggraph')
library(ggraph)

## Warning: package 'ggraph' was built under R version 4.1.3

#Access Token/Secret Method

# Twitter API credentials are read from environment variables (for example
# set in ~/.Renviron) so that secrets are never stored in this file.
# NOTE(review): the key, secret and tokens that were previously hard-coded
# here have been exposed and should be revoked and regenerated.
api_key <- Sys.getenv("TWITTER_API_KEY")
api_secret_key <- Sys.getenv("TWITTER_API_SECRET_KEY")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")

# Fail early with a clear message instead of sending empty credentials.
twitter_credentials <- c(
  TWITTER_API_KEY = api_key,
  TWITTER_API_SECRET_KEY = api_secret_key,
  TWITTER_ACCESS_TOKEN = access_token,
  TWITTER_ACCESS_TOKEN_SECRET = access_token_secret
)
if (!all(nzchar(twitter_credentials))) {
  stop(
    "Missing Twitter credentials; set the environment variable(s): ",
    paste(names(twitter_credentials)[!nzchar(twitter_credentials)],
          collapse = ", "),
    call. = FALSE
  )
}

## authenticate with the app key/secret and the access token/secret
token <- create_token(
  app = "sileshiproject1",
  consumer_key = api_key,
  consumer_secret = api_secret_key,
  access_token = access_token,
  access_secret = access_token_secret
)
# Print the token that was just registered.
get_token()

## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> sileshiproject1
##   key:    4ktmi3NtrGLKR9G78Vb6ToU0T
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

#Search for Tweets Related to Climate

# Search for up to 10,000 English-language tweets matching "climate change",
# excluding retweets. retryonratelimit = TRUE asks rtweet to wait for the
# rate limit to reset and continue, rather than stopping early with a
# "Rate limit exceeded" warning and a truncated result.
climate_tweets <- search_tweets(
  q = "climate change",
  n = 10000,
  lang = "en",
  include_rts = FALSE,
  retryonratelimit = TRUE
)

## Warning: Rate limit exceeded - 88

## Warning: Rate limit exceeded

# Preview the first few raw tweet texts to check for emojis and other
# non-word content before cleaning.
head(climate_tweets[["text"]])

## [1] "@shaunnapiranha1 @karen55777911 @motogpbsb Statically you’re be a Putin &amp; Trump supporter .. climate change sceptic, anti vac , 9/11 ‘truther’ &amp; possibly believe the moon landings were faked..  you see conspiracy everywhere… I’d suggest it’s you being ‘played’ by fantasists on social media"
## [2] "@JH4PDX Climate change is a total lie. It was made up along with the use of UFOs and aliens in Iron Mountain, NY as a way to scare the people into accepting a new world communist government. Please study instead of buying government BS!"                                                              
## [3] "#ClimateChange the Tool of Control #HumanRights #Propaganda #Censorship #GlobalWarming #GlobalCooling #MiniIceAge #SolarCycles #SunSpots #SolarMinimums #PoleShifts #Taxes #Pseudoscience #Socialism #ClimateHoax #Facisim https://t.co/HvnsfonTT8 https://t.co/46rpsCQF4O"                                
## [4] "We don't need more IPCC reports to take action on climate change: creating human meaning and connection is critical now. Terrific from @MillieRooney #Science4PublicGood @AustraliareMADE"                                                                                                                 
## [5] "@TimWilsonMP @GetUp When your climate change denier team do it to your opponent it’s fine with you https://t.co/QXsn24ECpY"                                                                                                                                                                                
## [6] "Economists care about climate change more than any other election issue. #auspol  Not one of 50 top economists listed lower taxes as an important election issue. One said “Neither major party was offering anything substantive “.  https://t.co/zh3Xs9ePpf"

#Data Clean-Up

# Remove URLs from the tweet text (done in base R; an earlier attempt with
# mutate_at() did not work).
# The pattern removes each token that starts with "http" up to the next
# whitespace, so words that follow a link are kept. The previous pattern
# "http.*" deleted everything from the first link to the end of the tweet,
# and a second "https.*" pass could never match anything the first had left.
climate_tweets$stripped_text <- gsub(
  "http[^[:space:]]*", "", climate_tweets$text
)

# Demonstration: unique() compares strings exactly, so "Dog" and "dog" count
# as different words and punctuation such as "," is a value of its own.
a_list_of_words <- c("Dog", rep("dog", 2), rep("cat", 2), ",")
unique(a_list_of_words)

## [1] "Dog" "dog" "cat" ","

## [1] "Dog" "dog" "cat" ","
## [1] "Dog" "dog" "cat" ","

# Split the URL-stripped tweet text into one word per row. unnest_tokens()
# is relied on here to lowercase the text and drop punctuation. Only the
# stripped_text column is passed in, so no tweet id is carried along.
climate_tweets_clean <- unnest_tokens(
  dplyr::select(climate_tweets, stripped_text),
  output = word,
  input = stripped_text
)

plot the top 15 words – notice any issues?

# Plot the 15 most frequent words -- notice any issues?
# (Stop words are only removed further down in this script, so they are
# still included in this count.)
climate_tweets_clean %>%
  count(word, sort = TRUE) %>%
  # Rank explicitly by the count column instead of relying on top_n()'s
  # implicit "last column" default.
  top_n(15, n) %>%
  # Order the bars by frequency rather than alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # Labels are attached to aesthetics, not screen positions: x is the word
  # and y is its count, even though coord_flip() draws words vertically.
  labs(
    x = "Unique words",
    y = "Count",
    title = "Count of unique words found in tweets"
  )

## Selecting by n

#Explore Networks of Words

# Bring the stop_words lexicon table (shipped with tidytext) into the
# workspace.
data(stop_words)
# Preview its first six rows.
head(stop_words, n = 6)

## # A tibble: 6 x 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART

## [1] TRUE

# Number of word tokens before stop words are removed (one row per word).
nrow(climate_tweets_clean)

## [1] 223516

## [1] 237945
## [1] 230984 or something similar
# Remove stop words from the list of words. The join key is given explicitly
# so the match does not depend on which column names the two tables happen
# to share.
cleaned_tweet_words <- climate_tweets_clean %>%
  anti_join(stop_words, by = "word")

## Joining, by = "word"

## Joining, by = "word"
# Number of word tokens left after dropping stop words; this should be
# smaller than the row count of climate_tweets_clean.
nrow(cleaned_tweet_words)

## [1] 115351

## [1] 122857
## [1] 118701 or something similar

 library(devtools)

## Loading required package: usethis

 # Install widyr from GitHub only when it is not already available, so the
 # script does not contact GitHub on every run.
 if (!requireNamespace("widyr", quietly = TRUE)) {
   install_github("dgrtwo/widyr")
 }

## Skipping install of 'widyr' from a github remote, the SHA1 (6312a8ef) has not changed since last install.
##   Use `force = TRUE` to force installation

library(widyr)
# Tokenize the URL-stripped tweet text into bigrams (pairs of consecutive
# words), one pair per row in the paired_words column.
climate_tweets_paired_words <- unnest_tokens(
  dplyr::select(climate_tweets, stripped_text),
  output = paired_words,
  input = stripped_text,
  token = "ngrams",
  n = 2
)
# Show the most frequent bigrams first.
count(climate_tweets_paired_words, paired_words, sort = TRUE)

## # A tibble: 109,097 x 2
##    paired_words       n
##    <chr>          <int>
##  1 climate change  6912
##  2 of the           672
##  3 change is        638
##  4 in the           618
##  5 change and       596
##  6 of climate       535
##  7 on climate       470
##  8 is a             427
##  9 about climate    378
## 10 the climate      343
## # ... with 109,087 more rows

Warning: `...` is not empty.

We detected these problematic arguments:

* `needs_dots`

These dots only exist to allow future extensions and should be empty.

Did you misspecify an argument?

# A tibble: 120,241 x 2

paired_words n

1 climate change 7181

2 change is 827

3 of the 757

4 in the 724

5 change and 644

6 of climate 555

7 on climate 513

8 is a 501

9 on the 407

10 about climate 389

# … with 120,231 more rows

# A tibble: 134,656 x 2

paired_words n

1 climate change 1021

2 of the 804

3 in the 798

4 climatechange is 570

5 is a 442

6 of climatechange 437

7 on the 383

8 on climatechange 364

9 to the 354

10 this is 331

# . with 134,646 more rows

Again, eliminate the stop words from the paired words.

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:igraph':
## 
##     crossing

# Split each bigram into its first and second word.
climate_tweets_separated_words <- climate_tweets_paired_words %>%
  separate(paired_words, c("word1", "word2"), sep = " ")
# Keep only complete bigrams in which neither word is a stop word.
# Rows with a missing word are dropped explicitly: `NA %in% x` is FALSE, so
# the stop-word filters alone let NA bigrams through (presumably produced by
# texts too short to form a bigram), and they would later appear in the
# network as a node literally named "NA".
climate_tweets_filtered <- climate_tweets_separated_words %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# new bigram counts:
climate_words_counts <- climate_tweets_filtered %>%
  count(word1, word2, sort = TRUE)
head(climate_words_counts)

## # A tibble: 6 x 3
##   word1           word2       n
##   <chr>           <chr>   <int>
## 1 climate         change   6912
## 2 aged            care      189
## 3 fossil          fuels     181
## 4 milliganreports climate   174
## 5 global          warming   139
## 6 change          amp        96

# A tibble: 6 x 3

word1 word2 n

1 climate change 7181

2 fight climate 185

3 covid 19 164

4 frack frack 114

5 global warming 113

6 foreign policy 98

# A tibble: 6 x 3

word1 word2 n

1 climate change 1021

2 climatechange climatecrisis 232

3 climatechange globalwarming 147

4 global warming 141

5 hurricane dorian 113

6 climate crisis 100

#Finally, plot the data (not the prettiest graph, but interesting)

library(igraph)
library(ggraph)
# Draw the climate change word network: bigrams that occur at least 24 times
# become edges between their two words, with edge transparency and width
# scaled by the bigram count.
# NOTE(review): an earlier note here said plotting graph edges was broken --
# confirm whether that still applies.
ggraph(
  graph_from_data_frame(filter(climate_words_counts, n >= 24)),
  layout = "fr"
) +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(
    title = "Word Network: Tweets using the hashtag - Climate Change",
    subtitle = "Text mining twitter data ",
    x = "",
    y = ""
  )

## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"

Ethics of Mining and Web Scraping

sileshi

4/10/2022

plot the top 15 words – notice any issues?

Warning: ... is not empty.

We detected these problematic arguments:

* needs_dots

These dots only exist to allow future extensions and should be empty.

Did you misspecify an argument?

# A tibble: 120,241 x 2

paired_words n

1 climate change 7181

2 change is 827

3 of the 757

4 in the 724

5 change and 644

6 of climate 555

7 on climate 513

8 is a 501

9 on the 407

10 about climate 389

# … with 120,231 more rows

# A tibble: 134,656 x 2

paired_words n

1 climate change 1021

2 of the 804

3 in the 798

4 climatechange is 570

5 is a 442

6 of climatechange 437

7 on the 383

8 on climatechange 364

9 to the 354

10 this is 331

# . with 134,646 more rows

# A tibble: 6 x 3

word1 word2 n

1 climate change 7181

2 fight climate 185

3 covid 19 164

4 frack frack 114

5 global warming 113

6 foreign policy 98

# A tibble: 6 x 3

word1 word2 n

1 climate change 1021

2 climatechange climatecrisis 232

3 climatechange globalwarming 147

4 global warming 141

5 hurricane dorian 113

6 climate crisis 100

Warning: `...` is not empty.

* `needs_dots`