We use the keyword "Piedmont Park", a popular urban park in Atlanta.
#get/search tweets containing given words or hashtags
#as an example 'piedmont park' is used, which is saved under the data frame name 'park_tweets_all'
#'search_tweets' is a function from the 'rtweet' library
#q is the words/hashtags used to search tweets; n is the number of tweets to download at a time. Here it is 200.
park_tweets_all <- search_tweets(q = "piedmont park", n = 200)
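#Optional variations (a sketch -- include_rts, geocode, and retryonratelimit are real
#search_tweets arguments, but the geocode value and object name here are just examples):
# park_tweets_geo <- search_tweets(q = "piedmont park", n = 200,
#                                  include_rts = FALSE,            #drop retweets
#                                  geocode = "33.785,-84.373,5mi", #only tweets near the park
#                                  retryonratelimit = TRUE)        #wait out rate limits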
#For something more manageable, keep only the columns that contain no NA values:
park_tweets <- park_tweets_all[, sapply(park_tweets_all, Negate(anyNA)), drop = FALSE]
#show headers of the tweets downloaded from piedmont park
head(park_tweets, n = 5)
## # A tibble: 5 x 28
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 1363687901834268674 145355742~ 2021-10-28 03:01:11 MRocio_2 El Jar~ Twitt~
## 2 1385676791629582339 145353573~ 2021-10-28 01:34:59 LauraBa7350~ El Jar~ Twitt~
## 3 1232488463418765312 145353320~ 2021-10-28 01:24:56 Paulacolin31 El Jar~ Twitt~
## 4 1355649350479847427 145352851~ 2021-10-28 01:06:16 Norma_Marins El Jar~ Twitt~
## 5 1243675320823287815 145351969~ 2021-10-28 00:31:15 AleBauGoz El Jar~ Twitt~
## # ... with 22 more variables: display_text_width <dbl>, is_quote <lgl>,
## # is_retweet <lgl>, favorite_count <int>, retweet_count <int>, lang <chr>,
## # geo_coords <list>, coords_coords <list>, bbox_coords <list>,
## # status_url <chr>, name <chr>, location <chr>, description <chr>,
## # protected <lgl>, followers_count <int>, friends_count <int>,
## # listed_count <int>, statuses_count <int>, favourites_count <int>,
## # account_created_at <dttm>, verified <lgl>, profile_image_url <chr>
#save the tweets downloaded using rtweet as a csv
#write_as_csv(park_tweets_all, "park_tweets_all.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
#unique screen_name and location of those who tweeted about piedmont park
users <- park_tweets %>%
  select(screen_name, location) %>%
  group_by(screen_name) %>%
  summarise(location = first(location))
userlocations <- data.frame(table(users$location))
#Figure 1 [Your code here: make a bar chart showing the frequency of locations of tweets.
#Anything related to Atlanta is one bar, and other places are their own bars.
#If you filter for more than one place (i.e. you don't want to show places that are only
#mentioned once), make sure to mention this in the title!]
#As a hint, check out the 'gsub' function below.
### Here is a 'bad' example, please fix it with the directions above.
barplot(userlocations$Freq, main = "Figure 1: Locations of People who Tweet about Piedmont Park")
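#One possible approach (a sketch -- the regex and the Freq > 1 cutoff are assumptions,
#and 'grouped' is just an example name):
# users$location_grouped <- gsub(".*atlanta.*", "Atlanta area", users$location, ignore.case = TRUE)
# grouped <- data.frame(table(users$location_grouped))
# grouped <- grouped[grouped$Freq > 1, ]
# barplot(grouped$Freq, names.arg = grouped$Var1, las = 2,
#         main = "Figure 1: Tweet locations (Atlanta grouped; places mentioned more than once)")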
# Q3
#What percentage of the tweets are retweets?
#is_retweet is a logical column, so we can sum the TRUE values directly
Percentage_of_retweets <- sum(park_tweets$is_retweet) * 100 / nrow(park_tweets)
cat("Percentage of Retweets = ", Percentage_of_retweets, '%')
## Percentage of Retweets = 85.5 %
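#Equivalent one-liner (a sketch): the mean of a logical vector is the share of TRUEs
# mean(park_tweets$is_retweet) * 100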
#What percentage have geo_coords?
#geo_coords is a list column; a tweet without coordinates holds c(NA, NA)
Percentage_with_georecords <- (nrow(park_tweets) -
  sum(sapply(park_tweets$geo_coords, function(x) all(is.na(x))))) * 100 / nrow(park_tweets)
cat("Percentage of records with Geo coordinates = ", Percentage_with_georecords, '%')
## Percentage of records with Geo coordinates = 0.5 %
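#If you need usable coordinates later (a sketch; 'park_tweets_geo' is an example name):
#rtweet's lat_lng() appends lat/lng columns parsed from the tweets' coordinate fields
# park_tweets_geo <- lat_lng(park_tweets_all)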
# Q4 What are the three most popular sources for the tweets and how many tweets come from each? Why would this information be useful to a developer or to an urban analytics researcher?
table(park_tweets$source)
##
## Hootsuite Inc. IFTTT Instagram TweetDeck
## 3 2 1 1
## Twitter for Android Twitter for iPad Twitter for iPhone Twitter Web App
## 146 1 24 22
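#To answer Q4 directly, sort the table and keep the top three (a base-R sketch):
# head(sort(table(park_tweets$source), decreasing = TRUE), 3)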
### SECTION 2: YOUR OWN KEYWORD
#### Part 1: Retrieve Tweets
my_twts <- search_tweets(q = "#Manhattan", n = 100,
                         lang = "en",
                         include_rts = FALSE)
#save the tweets downloaded using rtweet as a csv
write_as_csv(my_twts, "my_twts.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
head(my_twts$text,3)
## [1] "Look south on 7th and W 39th. Just another busy day in midtown Manhattan \n•\n<U+0001F4F7>: Don-ya' @photogbydonya \n•\n#manhattan #newyorkcity #photography #nyc #nycphotographer https://t.co/pHpNRGAeW1"
## [2] "Stay safe this #halloween with our Worldwide chauffeur service - Book a Ride On Time Every Time get 10% Off #holloween Trick or Treat https://t.co/MVZGVEiSeq #airporttransfer #dallasfortworth #limousineservice #newyorkcity #newyork #laxairport #philadelphia #manhattan #chauffeur https://t.co/pc2M6YFbS7"
## [3] "Various photos of #KeithRichards:\n 1) With #BrianJones and #AnitaPallenberg in #Morocco.\n\n2)A portrait from the #StickyFingers session.\n\n3) With #AndrewOldham in #Manhattan, #NY.\n\n#RollingStones #NewYork #Rock #ClassicRock #Music https://t.co/ZPM420FJXx https://t.co/JdbAAZWoMR"
#strip URLs (the "http.*" pattern also matches https links)
my_twts$cleanedTxt <- gsub("http.*", "", my_twts$text)
#replace HTML-escaped ampersands with a literal & (a bare "amp*" pattern would
#wrongly strip the letters "am" from ordinary words such as "Oldham")
my_twts$cleanedTxt <- gsub("&amp;", "&", my_twts$cleanedTxt)
head(my_twts$cleanedTxt, 3)
## [1] "Look south on 7th and W 39th. Just another busy day in midtown Manhattan \n•\n<U+0001F4F7>: Don-ya' @photogbydonya \n•\n#manhattan #newyorkcity #photography #nyc #nycphotographer "
## [2] "Stay safe this #halloween with our Worldwide chauffeur service - Book a Ride On Time Every Time get 10% Off #holloween Trick or Treat "
## [3] "Various photos of #KeithRichards:\n 1) With #BrianJones and #AnitaPallenberg in #Morocco.\n\n2)A portrait from the #StickyFingers session.\n\n3) With #AndrewOldh in #Manhattan, #NY.\n\n#RollingStones #NewYork #Rock #ClassicRock #Music "
# Tokenize the tweets: unnest_tokens removes punctuation, converts to lowercase,
# and splits each tweet into one word per row
my_twts_clean <- my_twts %>%
dplyr::select(cleanedTxt) %>%
unnest_tokens(word, cleanedTxt)
#Then we will check the number of rows after tokenization
nrow(my_twts_clean)
## [1] 1967
#View(my_twts_clean)
# plot the top 15 words and sort them in order of their counts
my_twts_clean %>%
count(word, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Figure 2: Unique word counts found in tweets, before removing stop words")
## Selecting by n
#-- Do you observe any problem?
# load list of stop words - from the tidytext package
data("stop_words")
# view first 6 words
head(stop_words)
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
#View(stop_words)
The anti_join function comes from the dplyr package. It keeps only the rows of the first table that have no match in the second, so joining the tokens against stop_words removes the stop words; the result is saved as the cleaned tweet words.
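#Toy illustration with made-up data (not part of the assignment):
# dplyr::anti_join(data.frame(word = c("park", "the", "runs")), data.frame(word = "the"))
#keeps only the rows "park" and "runs", since "the" matches the second table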
# remove stop words from your list of words
cleanTokens <- my_twts_clean %>% anti_join(stop_words)
## Joining, by = "word"
# Check the number of rows after removal of the stop words. There should be fewer words now
nrow(cleanTokens)
## [1] 1294
# plot the top 15 words -- notice any issues?
cleanTokens %>%
count(word, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(y = "count",
       x = "words",
       title = "Figure 3: Unique word counts found in tweets after removing stop words",
       subtitle = "Stop words removed from the list")
## Selecting by n
###You may need these
# library(wordcloud2)
# library(RColorBrewer)
# pal <- brewer.pal(8,"Dark2")
#
# #Get some frequency counts for each word
# freq_df1 <- cleanTokens %>%
# count(word, sort = TRUE) %>%
# top_n(30) %>%
# mutate(word = reorder(word, n))
# wordcloud2(data = freq_df1, minRotation = 0, maxRotation = 0, ellipticity = 0.6)
###FURTHER CLEANING BASED ON YOUR EXPERTISE! YOUR CODE HERE
##adding stopwords -- the tokens are already lowercased, so the lowercase entries are
##the ones that matter; multi-word entries like 'New York' never match a single token
##(they are harmless but do nothing)
my_stopwords <- data.frame(word = c(stop_words$word,
                                    'Manhattan', 'manhattan', 'New York', 'newyork',
                                    'Newyork', 'NewYork', 'New', 'York', 'york', 'USA',
                                    'nyc', 'NYC', 'newyorkcity', 'city', 'ny', 'NY', 'brooklyn'))
cleanTokens2 <- my_twts_clean %>%
anti_join(my_stopwords)
## Joining, by = "word"
####Re-run the frequency counts
freq_df2 <- cleanTokens2 %>%
count(word, sort = TRUE) %>%
top_n(50) %>%
mutate(word = reorder(word, n))
## Selecting by n
library(wordcloud2)   #needed here, since the library() call above is commented out
wordcloud2(data = freq_df2, minRotation = 0, maxRotation = 0, ellipticity = 0.6)
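#wordcloud2 returns an htmlwidget rather than a base plot; a sketch for saving a copy
#(the file name is just an example):
# htmlwidgets::saveWidget(wordcloud2(freq_df2), "my_wordcloud.html", selfcontained = FALSE)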
library(widyr)
#get ngrams. You may also try other values of n, e.g. n = 3 or n = 4 (see the sketch below)
my_twts_ngram <- my_twts %>%
dplyr::select(cleanedTxt) %>%
unnest_tokens(paired_words, cleanedTxt, token = "ngrams", n = 2)
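#The same call with n = 3 yields trigrams (a sketch; 'triple_words' is an example name):
# my_twts_trigram <- my_twts %>%
#   dplyr::select(cleanedTxt) %>%
#   unnest_tokens(triple_words, cleanedTxt, token = "ngrams", n = 3)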
#show ngrams with sorted values
my_twts_ngram %>%
count(paired_words, sort = TRUE)
## # A tibble: 1,333 x 2
## paired_words n
## <chr> <int>
## 1 get your 12
## 2 gospel brunch 12
## 3 new york 12
## 4 pilgrims gospel 12
## 5 the pilgrims 12
## 6 manhattan nyc 11
## 7 in manhattan 8
## 8 on the 8
## 9 check out 7
## 10 and earn 6
## # ... with 1,323 more rows
Here we see the ngrams include stop words such as *a, to, etc.* Next we will try to obtain ngrams without stop words. We will use the separate function from the tidyr library to split the paired words into two columns, i.e. word1 and word2. Subsequently we filter out rows containing stop words using the filter function.
library(tidyr)
#separate the paired words into two columns
my_twts_ngram <- my_twts_ngram %>%
separate(paired_words, c("word1", "word2"), sep = " ")
# drop rows that have a stop word in either the word1 or word2 column
my_twts_filtered <- my_twts_ngram %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# Sort the new bi-gram (n=2) counts:
my_words_counts <- my_twts_filtered %>%
count(word1, word2, sort = TRUE)
#head(my_words_counts)
# words occurring in pairs after filtering out the stop words
head(my_twts_filtered)
## # A tibble: 6 x 2
## word1 word2
## <chr> <chr>
## 1 busy day
## 2 midtown manhattan
## 3 manhattan don
## 4 don ya
## 5 ya photogbydonya
## 6 photogbydonya manhattan
Using the igraph and ggraph libraries we visualize the words occurring in pairs. (Note: the edges could not be drawn at this time.)
# plot word network
library(igraph)   #for graph_from_data_frame
library(ggraph)   #for ggraph and the geom_edge_*/geom_node_* layers
my_words_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 4) +
  labs(title = "Figure 5: Word Network: Tweets using my hashtag",
       subtitle = "Text mining twitter data",
       x = "", y = "")
End