library("rtweet")
library("ggplot2")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidytext")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.4 ✓ purrr 0.3.4
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks rtweet::flatten()
## x dplyr::lag() masks stats::lag()
library("igraph")
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library("ggraph")
library("tidyr")
library("wordcloud2")
library("textdata")
appname <- "mytwitterapp"
twitter_token <- create_token(
  app = appname,
  consumer_key = "YOUR_CONSUMER_KEY",        # live credentials redacted: keys like these
  consumer_secret = "YOUR_CONSUMER_SECRET",  # should never appear in a shared script
  access_token = "YOUR_ACCESS_TOKEN",
  access_secret = "YOUR_ACCESS_SECRET")
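Hard-coding API keys risks leaking them whenever the script is shared; a safer sketch reads them from environment variables instead (the variable names below are assumptions, set in ~/.Renviron):
# hypothetical variable names; define them in ~/.Renviron, e.g. TWITTER_CONSUMER_KEY=...
twitter_token <- create_token(
  app = appname,
  consumer_key = Sys.getenv("TWITTER_CONSUMER_KEY"),
  consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET"),
  access_token = Sys.getenv("TWITTER_ACCESS_TOKEN"),
  access_secret = Sys.getenv("TWITTER_ACCESS_SECRET"))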
Q1: search for 200 recent tweets mentioning "Piedmont Park".
park_tweets_all <- search_tweets(q = "Piedmont Park", n = 200)
head(park_tweets_all, n = 10)
## # A tibble: 10 × 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 1262808354256084992 145454962… 2021-10-30 20:43:50 BonifacioB… "El Ja… Twitt…
## 2 1395865041056980993 145454457… 2021-10-30 20:23:45 739lsil "El Ja… Twitt…
## 3 233997042 145453876… 2021-10-30 20:00:41 put_up_or_… "I’ll … Twitt…
## 4 1381392917273710594 145452445… 2021-10-30 19:03:47 NellieC059… "Piedm… Twitt…
## 5 1192138537279348736 145447049… 2021-10-30 15:29:24 lappsga "It’s … Twitt…
## 6 1309321100 145445723… 2021-10-30 14:36:41 jballen5 "It’s … Twitt…
## 7 1309321100 145422429… 2021-10-29 23:11:05 jballen5 "#Scho… Twitt…
## 8 1020141119521378305 145445324… 2021-10-30 14:20:49 FAToomerPTA "It’s … Twitt…
## 9 1083564197055815681 145445142… 2021-10-30 14:13:36 Tish123456… "It’s … Twitt…
## 10 1083564197055815681 145422041… 2021-10-29 22:55:39 Tish123456… "#Scho… Twitt…
## # … with 84 more variables: display_text_width <dbl>, reply_to_status_id <chr>,
## # reply_to_user_id <chr>, reply_to_screen_name <chr>, is_quote <lgl>,
## # is_retweet <lgl>, favorite_count <int>, retweet_count <int>,
## # quote_count <int>, reply_count <int>, hashtags <list>, symbols <list>,
## # urls_url <list>, urls_t.co <list>, urls_expanded_url <list>,
## # media_url <list>, media_t.co <list>, media_expanded_url <list>,
## # media_type <list>, ext_media_url <list>, ext_media_t.co <list>, …
Q3: tally how many of the collected tweets are retweets.
table(park_tweets_all$is_retweet)
##
## FALSE TRUE
## 51 149
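Roughly three quarters of the sample are retweets; if only original tweets were wanted downstream, a one-line filter would do (a sketch, not applied in the original analysis):
park_originals <- dplyr::filter(park_tweets_all, !is_retweet)   # hypothetical: keeps the 51 originals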
Q4: drop every column that contains missing values, then tabulate tweet sources.
# keep only the columns that contain no missing values
park_tweets <- park_tweets_all[, sapply(park_tweets_all, Negate(anyNA)), drop = FALSE]
table(park_tweets$source)
##
## Buffer Hootsuite Inc. IFTTT Instagram
## 1 5 2 1
## Post Studio shareist True Anthem TweetDeck
## 1 1 3 2
## Twitter for Android Twitter for iPad Twitter for iPhone Twitter Web App
## 122 1 38 23
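Note that my_twts and its cleanedTxt column are never created in this section, and the word pairs shown further below suggest it comes from a separate, hashtag-based search rather than from park_tweets. A minimal sketch of how such an object could be built; the query and the cleaning step are assumptions:
my_twts <- search_tweets(q = "#myhashtag", n = 200) %>%   # hypothetical query
  mutate(cleanedTxt = gsub("https\\S*", "", text))        # assumed cleaning: strip URLs from the tweet text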
Q6: tokenize the cleaned tweet text into one word per row.
my_twts_clean <- my_twts %>%
dplyr::select(cleanedTxt) %>%
unnest_tokens(word, cleanedTxt)
nrow(my_twts_clean)
## [1] 1025
Q5 (in Step 5): plot the 15 most frequent words.
my_twts_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15, n) %>%                     # naming the weighting column avoids the "Selecting by n" message
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Figure 2-1: Unique word counts found in tweets, stop words included")

Q6 (continued): remove stop words from the tokens.
cleanTokens <- my_twts_clean %>% anti_join(stop_words, by = "word")   # spelling out the key suppresses the "Joining, by" message
nrow(cleanTokens)
## [1] 577
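wordcloud2 is loaded above but not used in this section; a minimal sketch of a word cloud built from the stop-word-free tokens:
cleanTokens %>%
  count(word, sort = TRUE) %>%
  wordcloud2()   # expects words in the first column and counts in the second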
Figure 4: build a word network from frequently co-occurring words.
library(widyr)
my_twts_ngram <- my_twts %>%
  dplyr::select(cleanedTxt) %>%
  unnest_tokens(paired_words, cleanedTxt, token = "ngrams", n = 3)   # note: n = 3 produces trigrams, not pairs
my_twts_ngram %>%
count(paired_words, sort = TRUE)
## # A tibble: 844 × 2
## paired_words n
## <chr> <int>
## 1 apa mla dissertation 6
## 2 college essays graduate 6
## 3 deadline goals school 6
## 4 dissertation thesis major 6
## 5 essayhelp papers help 6
## 6 essays graduate apa 6
## 7 grade essayhelp papers 6
## 8 graduate apa mla 6
## 9 graduation deadline goals 6
## 10 help graduation deadline 6
## # … with 834 more rows
library(tidyr)
my_twts_ngram <- my_twts_ngram %>%
  separate(paired_words, c("word1", "word2"), sep = " ")   # keeps only the first two words of each trigram
## Warning: Expected 2 pieces. Additional pieces discarded in 965 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
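The warning is a direct consequence of tokenizing trigrams (n = 3) and then splitting into only two pieces: the third word of every trigram is thrown away. A bigram variant (n = 2) yields true word pairs without discarding anything; a sketch:
my_twts_bigram <- my_twts %>%   # hypothetical bigram version
  dplyr::select(cleanedTxt) %>%
  unnest_tokens(paired_words, cleanedTxt, token = "ngrams", n = 2) %>%
  separate(paired_words, c("word1", "word2"), sep = " ")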
my_twts_filtered <- my_twts_ngram %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
my_words_counts <- my_twts_filtered %>%
  count(word1, word2, sort = TRUE)   # count each remaining word pair
head(my_twts_filtered)
## # A tibble: 6 × 2
## word1 word2
## <chr> <chr>
## 1 firehazard ucsb
## 2 multimillion wrongful
## 3 wrongful death
## 4 death student
## 5 student suicide
## 6 suicide lawsuits
my_words_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = .6) +   # constant alpha belongs outside aes()
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 4) +
  labs(title = "Figure 4: Word Network: Tweets using my hashtag",
       subtitle = "Text mining Twitter data",
       x = "", y = "")
