We use the keyword "Piedmont Park", a popular urban park in Atlanta.
#get/search tweets containing given words or hashtags
#as an example 'piedmont park' is used, which is saved under the data frame name 'park_tweets_all'
#'search_tweets' is a function from the 'rtweet' library
#q is the words/hashtags used to search tweets; n is the number of tweets to download at a time. Here it is 200.
park_tweets_all <- search_tweets(q = "piedmont park", n = 200)
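#Optional variations (a sketch -- include_rts, geocode, and retryonratelimit are real
#search_tweets arguments, but the geocode value and object name here are just examples):
# park_tweets_geo <- search_tweets(q = "piedmont park", n = 200,
#                                  include_rts = FALSE,            #drop retweets
#                                  geocode = "33.785,-84.373,5mi", #only tweets near the park
#                                  retryonratelimit = TRUE)        #wait out rate limits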
#For something more manageable, keep only the columns that contain no NA values:
park_tweets <- park_tweets_all[, sapply(park_tweets_all, Negate(anyNA)), drop = FALSE]
#show headers of the tweets downloaded from piedmont park
head(park_tweets, n = 5)
## # A tibble: 5 x 28
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 1363687901834268674 145355742~ 2021-10-28 03:01:11 MRocio_2 El Jar~ Twitt~
## 2 1385676791629582339 145353573~ 2021-10-28 01:34:59 LauraBa7350~ El Jar~ Twitt~
## 3 1232488463418765312 145353320~ 2021-10-28 01:24:56 Paulacolin31 El Jar~ Twitt~
## 4 1355649350479847427 145352851~ 2021-10-28 01:06:16 Norma_Marins El Jar~ Twitt~
## 5 1243675320823287815 145351969~ 2021-10-28 00:31:15 AleBauGoz El Jar~ Twitt~
## # ... with 22 more variables: display_text_width <dbl>, is_quote <lgl>,
## # is_retweet <lgl>, favorite_count <int>, retweet_count <int>, lang <chr>,
## # geo_coords <list>, coords_coords <list>, bbox_coords <list>,
## # status_url <chr>, name <chr>, location <chr>, description <chr>,
## # protected <lgl>, followers_count <int>, friends_count <int>,
## # listed_count <int>, statuses_count <int>, favourites_count <int>,
## # account_created_at <dttm>, verified <lgl>, profile_image_url <chr>
#save the tweets downloaded using rtweet as a csv
#write_as_csv(park_tweets_all, "park_tweets_all.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
#unique screen_name and location of those who tweeted about piedmont park
users <- park_tweets %>%
  select(screen_name, location) %>%
  group_by(screen_name) %>%
  summarise(location = first(location))
userlocations <- data.frame(table(users$location))
#Figure 1 [Your code here: make a bar chart showing the frequency of locations of tweets.
#Anything related to Atlanta is one bar, and other places are their own bars.
#If you filter for more than one place (i.e. you don't want to show places that are only
#mentioned once), make sure to mention this in the title!]
#As a hint, check out the 'gsub' function below.
### Here is a 'bad' example, please fix it with the directions above.
barplot(userlocations$Freq, main = "Figure 1: Locations of People who Tweet about Piedmont Park")
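#One possible approach (a sketch -- the regex and the Freq > 1 cutoff are assumptions,
#and 'grouped' is just an example name):
# users$location_grouped <- gsub(".*atlanta.*", "Atlanta area", users$location, ignore.case = TRUE)
# grouped <- data.frame(table(users$location_grouped))
# grouped <- grouped[grouped$Freq > 1, ]
# barplot(grouped$Freq, names.arg = grouped$Var1, las = 2,
#         main = "Figure 1: Tweet locations (Atlanta grouped; places mentioned more than once)")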
# Q3
#What percentage of the tweets are retweets?
#is_retweet is a logical column, so we can sum the TRUE values directly
Percentage_of_retweets <- sum(park_tweets$is_retweet) * 100 / nrow(park_tweets)
cat("Percentage of Retweets = ", Percentage_of_retweets, '%')
## Percentage of Retweets = 85.5 %
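#Equivalent one-liner (a sketch): the mean of a logical vector is the share of TRUEs
# mean(park_tweets$is_retweet) * 100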
#What percentage have geo_coords?
#geo_coords is a list column; a tweet without coordinates holds c(NA, NA)
Percentage_with_georecords <- (nrow(park_tweets) -
  sum(sapply(park_tweets$geo_coords, function(x) all(is.na(x))))) * 100 / nrow(park_tweets)
cat("Percentage of records with Geo coordinates = ", Percentage_with_georecords, '%')
## Percentage of records with Geo coordinates = 0.5 %
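#If you need usable coordinates later (a sketch; 'park_tweets_geo' is an example name):
#rtweet's lat_lng() appends lat/lng columns parsed from the tweets' coordinate fields
# park_tweets_geo <- lat_lng(park_tweets_all)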
# Q4 What are the three most popular sources for the tweets and how many tweets come from each? Why would this information be useful to a developer or to an urban analytics researcher?
table(park_tweets$source)
##
## Hootsuite Inc. IFTTT Instagram TweetDeck
## 3 2 1 1
## Twitter for Android Twitter for iPad Twitter for iPhone Twitter Web App
## 146 1 24 22
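#To answer Q4 directly, sort the table and keep the top three (a base-R sketch):
# head(sort(table(park_tweets$source), decreasing = TRUE), 3)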
### SECTION 2: YOUR OWN KEYWORD
#### Part 1: Retrieve Tweets
my_twts <- search_tweets(q = "#Manhattan", n = 100,
                         lang = "en",
                         include_rts = FALSE)
#save the tweets downloaded using rtweet as a csv
write_as_csv(my_twts, "my_twts.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
head(my_twts$text,3)
## [1] "Look south on 7th and W 39th. Just another busy day in midtown Manhattan \n•\n<U+0001F4F7>: Don-ya' @photogbydonya \n•\n#manhattan #newyorkcity #photography #nyc #nycphotographer https://t.co/pHpNRGAeW1"
## [2] "Stay safe this #halloween with our Worldwide chauffeur service - Book a Ride On Time Every Time get 10% Off #holloween Trick or Treat https://t.co/MVZGVEiSeq #airporttransfer #dallasfortworth #limousineservice #newyorkcity #newyork #laxairport #philadelphia #manhattan #chauffeur https://t.co/pc2M6YFbS7"
## [3] "Various photos of #KeithRichards:\n 1) With #BrianJones and #AnitaPallenberg in #Morocco.\n\n2)A portrait from the #StickyFingers session.\n\n3) With #AndrewOldham in #Manhattan, #NY.\n\n#RollingStones #NewYork #Rock #ClassicRock #Music https://t.co/ZPM420FJXx https://t.co/JdbAAZWoMR"
#strip URLs (the "http.*" pattern also matches https links)
my_twts$cleanedTxt <- gsub("http.*", "", my_twts$text)
#replace HTML-escaped ampersands with a literal & (a bare "amp*" pattern would
#wrongly strip the letters "am" from ordinary words such as "Oldham")
my_twts$cleanedTxt <- gsub("&amp;", "&", my_twts$cleanedTxt)
head(my_twts$cleanedTxt, 3)
## [1] "Look south on 7th and W 39th. Just another busy day in midtown Manhattan \n•\n<U+0001F4F7>: Don-ya' @photogbydonya \n•\n#manhattan #newyorkcity #photography #nyc #nycphotographer "
## [2] "Stay safe this #halloween with our Worldwide chauffeur service - Book a Ride On Time Every Time get 10% Off #holloween Trick or Treat "
## [3] "Various photos of #KeithRichards:\n 1) With #BrianJones and #AnitaPallenberg in #Morocco.\n\n2)A portrait from the #StickyFingers session.\n\n3) With #AndrewOldh in #Manhattan, #NY.\n\n#RollingStones #NewYork #Rock #ClassicRock #Music "
# Tokenize the tweets: unnest_tokens removes punctuation, converts to lowercase,
# and splits each tweet into one word per row
my_twts_clean <- my_twts %>%
dplyr::select(cleanedTxt) %>%
unnest_tokens(word, cleanedTxt)
#Then we will check the number of rows after tokenization
nrow(my_twts_clean)
## [1] 1967
#View(my_twts_clean)
# plot the top 15 words and sort them in order of their counts
my_twts_clean %>%
count(word, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Figure 2: Unique word counts found in tweets, before removing stop words")
## Selecting by n
#-- Do you observe any problem?
# load list of stop words - from the tidytext package
data("stop_words")
# view first 6 words
head(stop_words)
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
#View(stop_words)
The anti_join function comes from the dplyr package. It keeps only the rows of the first table that have no match in the second, so joining the tokens against stop_words removes the stop words; the result is saved as the cleaned tweet words.
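#Toy illustration with made-up data (not part of the assignment):
# dplyr::anti_join(data.frame(word = c("park", "the", "runs")), data.frame(word = "the"))
#keeps only the rows "park" and "runs", since "the" matches the second table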
# remove stop words from your list of words
cleanTokens <- my_twts_clean %>% anti_join(stop_words)
## Joining, by = "word"
# Check the number of rows after removal of the stop words. There should be fewer words now
nrow(cleanTokens)
## [1] 1294
# plot the top 15 words -- notice any issues?
cleanTokens %>%
count(word, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(y = "count",
       x = "words",
       title = "Figure 3: Unique word counts found in tweets after removing stop words",
       subtitle = "Stop words removed from the list")
## Selecting by n
###You may need these
# library(wordcloud2)
# library(RColorBrewer)
# pal <- brewer.pal(8,"Dark2")
#
# #Get some frequency counts for each word
# freq_df1 <- cleanTokens %>%
# count(word, sort = TRUE) %>%
# top_n(30) %>%
# mutate(word = reorder(word, n))
# wordcloud2(data = freq_df1, minRotation = 0, maxRotation = 0, ellipticity = 0.6)
###FURTHER CLEANING BASED ON YOUR EXPERTISE! YOUR CODE HERE
##adding stopwords -- the tokens are already lowercased, so the lowercase entries are
##the ones that matter; multi-word entries like 'New York' never match a single token
##(they are harmless but do nothing)
my_stopwords <- data.frame(word = c(stop_words$word,
                                    'Manhattan', 'manhattan', 'New York', 'newyork',
                                    'Newyork', 'NewYork', 'New', 'York', 'york', 'USA',
                                    'nyc', 'NYC', 'newyorkcity', 'city', 'ny', 'NY', 'brooklyn'))
cleanTokens2 <- my_twts_clean %>%
anti_join(my_stopwords)
## Joining, by = "word"
####Re-run the frequency counts
freq_df2 <- cleanTokens2 %>%
count(word, sort = TRUE) %>%
top_n(50) %>%
mutate(word = reorder(word, n))
## Selecting by n
library(wordcloud2)   #needed here, since the library() call above is commented out
wordcloud2(data = freq_df2, minRotation = 0, maxRotation = 0, ellipticity = 0.6)
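#wordcloud2 returns an htmlwidget rather than a base plot; a sketch for saving a copy
#(the file name is just an example):
# htmlwidgets::saveWidget(wordcloud2(freq_df2), "my_wordcloud.html", selfcontained = FALSE)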
library(widyr)
#get ngrams. You may also try other values of n, e.g. n = 3 or n = 4 (see the sketch below)
my_twts_ngram <- my_twts %>%
dplyr::select(cleanedTxt) %>%
unnest_tokens(paired_words, cleanedTxt, token = "ngrams", n = 2)
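#The same call with n = 3 yields trigrams (a sketch; 'triple_words' is an example name):
# my_twts_trigram <- my_twts %>%
#   dplyr::select(cleanedTxt) %>%
#   unnest_tokens(triple_words, cleanedTxt, token = "ngrams", n = 3)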
#show ngrams with sorted values
my_twts_ngram %>%
count(paired_words, sort = TRUE)
## # A tibble: 1,333 x 2
## paired_words n
## <chr> <int>
## 1 get your 12
## 2 gospel brunch 12
## 3 new york 12
## 4 pilgrims gospel 12
## 5 the pilgrims 12
## 6 manhattan nyc 11
## 7 in manhattan 8
## 8 on the 8
## 9 check out 7
## 10 and earn 6
## # ... with 1,323 more rows
Here we see the ngrams include stop words such as *a, to, etc.* Next we will try to obtain ngrams without stop words. We will use the separate function from the tidyr library to split the paired words into two columns, i.e. word1 and word2. Subsequently we filter out rows containing stop words using the filter function.
library(tidyr)
#separate the paired words into two columns
my_twts_ngram <- my_twts_ngram %>%
separate(paired_words, c("word1", "word2"), sep = " ")
# drop rows that have a stop word in either the word1 or word2 column
my_twts_filtered <- my_twts_ngram %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# Sort the new bi-gram (n=2) counts:
my_words_counts <- my_twts_filtered %>%
count(word1, word2, sort = TRUE)
#head(my_words_counts)
# words occurring in pairs after filtering out the stop words
head(my_twts_filtered)
## # A tibble: 6 x 2
## word1 word2
## <chr> <chr>
## 1 busy day
## 2 midtown manhattan
## 3 manhattan don
## 4 don ya
## 5 ya photogbydonya
## 6 photogbydonya manhattan
Using the igraph and ggraph libraries we visualize the words occurring in pairs. (Note: the edges could not be drawn at this time.)
# plot word network
library(igraph)   #for graph_from_data_frame
library(ggraph)   #for ggraph and the geom_edge_*/geom_node_* layers
my_words_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 4) +
  labs(title = "Figure 5: Word Network: Tweets using my hashtag",
       subtitle = "Text mining twitter data",
       x = "", y = "")
End