Installing relevant packages and setting WD
# Packages required by this analysis.
packages <- c(
  "rtweet", "ggplot2", "dplyr", "tidytext", "tidyverse",
  "igraph", "ggraph", "tidyr", "wordcloud2", "textdata"
)

# Flag the packages that are already present in the local library.
installed_packages <- is.element(packages, rownames(installed.packages()))

# Install whichever packages are still missing.
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}

# Attach every package; invisible() hides the list returned by lapply().
invisible(lapply(packages, library, character.only = TRUE))

# Read the tweet export from its local CSV file.
data <- read.csv("C:\\Users\\cod\\Desktop\\PhD Files\\Course Works\\2nd Year Falls Semester\\Urban Analytics\\Assignment\\Assig6updated\\Assignment6_Twitter\\lifeboatfiles\\park_tweets_all.csv")
When we retrieved data, what are two major elements we need to put in the “search_tweets()” call?
To be able to retrieve data from Twitter using search_tweets(), we need to input a keyword we are interested in and the number of tweets we want to pull from Twitter.
What is the difference between screen_name and name? (Search for someone’s screen_name and name and give an example.) What is their location, and did they write this or did the GPS catch it?
A screen name starts with @ and cannot contain spaces, whereas a name can contain spaces. For example, the screen name @ProvidenceAdu belongs to the name Providence Adu. His location is Charlotte, NC, and he wrote this himself.
Downloading Twitter Data
# Create the Twitter API token used by the rtweet calls below.
#
# The four credentials are read from environment variables (for example set in
# ~/.Renviron) instead of being written into the script, so that secrets are
# never shared together with the analysis:
#   TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
#   TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET
# Credentials that have ever been stored in plain text in a shared file should
# be regenerated in the Twitter developer portal.
appname <- "mytwitterapp"

twitter_credentials <- Sys.getenv(
  c(
    "TWITTER_CONSUMER_KEY",
    "TWITTER_CONSUMER_SECRET",
    "TWITTER_ACCESS_TOKEN",
    "TWITTER_ACCESS_SECRET"
  ),
  unset = NA_character_
)

# Fail early with a clear message rather than sending empty credentials.
if (anyNA(twitter_credentials) || !all(nzchar(twitter_credentials))) {
  stop(
    "Missing Twitter credentials. Set the environment variables: ",
    paste(names(twitter_credentials), collapse = ", "),
    call. = FALSE
  )
}

twitter_token <- create_token(
  app = appname,
  consumer_key = twitter_credentials[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_credentials[["TWITTER_CONSUMER_SECRET"]],
  access_token = twitter_credentials[["TWITTER_ACCESS_TOKEN"]],
  access_secret = twitter_credentials[["TWITTER_ACCESS_SECRET"]]
)
# Search Twitter for the phrase "piedmont park", requesting 200 tweets.
park_tweets_all <- search_tweets(q = "piedmont park", n = 200)

# For something more manageable, keep only the columns for which anyNA() is
# FALSE. vapply() always returns a logical vector, even when there are no
# columns, whereas sapply() would return an empty list that cannot be used as
# a column index.
complete_columns <- vapply(
  park_tweets_all,
  function(column) !anyNA(column),
  logical(1)
)
park_tweets <- park_tweets_all[, complete_columns, drop = FALSE]

# Show the first five rows of the tweets downloaded for Piedmont Park.
head(park_tweets, n = 5)
## # A tibble: 5 x 28
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 1343960549693005828 145382601~ 2021-10-28 20:48:27 mireya_rena~ "El Ja~ Twitt~
## 2 19680156 145381004~ 2021-10-28 19:45:00 ParkTavern "We're~ Hoots~
## 3 17836882 145380631~ 2021-10-28 19:30:10 FOX5Atlanta "PIEDM~ True ~
## 4 17836882 145368049~ 2021-10-28 11:10:13 FOX5Atlanta "PIEDM~ True ~
## 5 1440055025389113344 145379782~ 2021-10-28 18:56:27 AlvaLucina "El Ja~ Twitt~
## # ... with 22 more variables: display_text_width <dbl>, is_quote <lgl>,
## # is_retweet <lgl>, favorite_count <int>, retweet_count <int>, lang <chr>,
## # geo_coords <list>, coords_coords <list>, bbox_coords <list>,
## # status_url <chr>, name <chr>, location <chr>, description <chr>,
## # protected <lgl>, followers_count <int>, friends_count <int>,
## # listed_count <int>, statuses_count <int>, favourites_count <int>,
## # account_created_at <dttm>, verified <lgl>, profile_image_url <chr>
On your own: What percentage of the tweets are retweets? What percentage have geo_coords?
The data I downloaded from Twitter after class has only one tweet with geo_coords, which represents \((1/200) \times 100 = 0.5\%\) of the tweets, and the percentage of retweets is \((166/200) \times 100 = 83\%\).
Looking at the data you added to the assignment folder, the tweets with geo_coords represent \((6/200) \times 100 = 3\%\) of the tweets, and the percentage of retweets is \((97/200) \times 100 = 48.5\%\).
# Turn the geo_coords column into a two-column data frame and cross-tabulate
# the coordinate pairs that are present.
# NOTE(review): assumes geo_coords holds one coordinate pair per tweet -
# confirm against the rtweet output.
d.frame <- as.data.frame(park_tweets$geo_coords)

# Bind the columns of d.frame together as the rows of a matrix.
datmat <- do.call(rbind, d.frame)

# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
datdf <- as.data.frame(datmat, row.names = FALSE)

# NOTE(review): the printed table shows 33.78... under xcoord and -84.37...
# under ycoord, so xcoord looks like latitude and ycoord like longitude -
# confirm before using these columns for mapping.
colnames(datdf)[1:2] <- c("xcoord", "ycoord")
table(datdf)
## ycoord
## xcoord -84.37439919
## 33.78499712 1
Number of retweets among the tweets pulled from Twitter after class
# Count retweets versus original tweets in the freshly downloaded data and
# express each count as a percentage.
label <- factor(
  park_tweets$is_retweet,
  levels = c(TRUE, FALSE),
  labels = c("retweet", "not_retweet")
)
datdf.table <- table(as.data.frame(label))
d.frame.table <- as.data.frame(datdf.table)

# Divide by the number of tabulated tweets instead of a fixed 200, so the
# percentages stay correct when the search returns a different number of rows.
mutate(d.frame.table, percent = Freq / sum(Freq) * 100)
## Var1 Freq percent
## 1 retweet 166 83
## 2 not_retweet 34 17
Number of geocoded tweets from data included in the assignment folder
# Tally how often each geo_coords value occurs in the supplied data set and
# show the counts as a one-column matrix.
table.data <- cbind(table(data$geo_coords))
table.data
## [,1]
## 33.7566 -84.3889 1
## 33.78231294 -84.36949884 1
## 33.78499712 -84.37439919 2
## 33.786519 -84.373762 1
## 33.78965774 -84.37408352 1
## NA NA 194
Number of retweets from data included in the assignment folder
# Count retweets versus original tweets in the supplied data set and express
# each count as a percentage.
label <- factor(
  data$is_retweet,
  levels = c(TRUE, FALSE),
  labels = c("retweet", "not_retweet")
)
datdf.table <- table(as.data.frame(label))
d.frame.table <- as.data.frame(datdf.table)

# Divide by the number of tabulated tweets instead of a fixed 200, so the
# percentages stay correct for a file with a different number of rows.
mutate(d.frame.table, percent = Freq / sum(Freq) * 100)
## Var1 Freq percent
## 1 retweet 97 48.5
## 2 not_retweet 103 51.5
On your own: What are the three most popular sources for the tweets and how many tweets and how many tweets come from each? Why would this information be useful to a developer or to an urban analytics researcher?
The three most popular sources are Twitter for Android, Twitter for iPhone, and Twitter Web App. See the table below for the number of tweets from each of the sources. A developer would be interested in knowing the popular sources of tweets so that they can design their products/apps to be compatible with these sources.
# Tweet counts per source for the downloaded tweets (a) and for the supplied
# data set (b).
a <- data.frame(table(park_tweets$source))
b <- data.frame(table(data$source))

# Give the second count column its own name so that the join below matches on
# "Var1" only.
names(b)[2] <- "Frequency"
join.table <- full_join(a, b)
## Joining, by = "Var1"
names(join.table)[2:3] <- c("Data.out.Cl", "Dat.in.Cl")

# Sort so that the most frequent sources in the downloaded tweets come first.
join.table %>%
  arrange(desc(Data.out.Cl))
## Var1 Data.out.Cl Dat.in.Cl
## 1 Twitter for Android 139 42
## 2 Twitter for iPhone 25 87
## 3 Twitter Web App 24 30
## 4 Hootsuite Inc. 4 2
## 5 IFTTT 2 NA
## 6 True Anthem 2 3
## 7 TweetDeck 2 1
## 8 Instagram 1 8
## 9 Twitter for iPad 1 2
## 10 dlvr.it NA 1
## 11 Echofon NA 1
## 12 Emplifi NA 1
## 13 Heropost NA 2
## 14 Mailchimp NA 1
## 15 Microsoft Power Platform NA 1
## 16 Post Studio NA 1
## 17 PulpNews NA 2
## 18 SocialBee.io v2 NA 1
## 19 SocialFlow NA 4
## 20 SocialNewsDesk NA 7
## 21 Sprinklr Publishing NA 1
## 22 Tweetlogix NA 1
## 23 WordPress.com NA 1
# One row per account: keep the first location listed for each screen name.
users <- park_tweets %>%
  select(screen_name, location) %>%
  group_by(screen_name) %>%
  summarise(location = first(location))

# Frequency of every distinct location string, plus the list of those strings.
userlocations <- data.frame(table(users$location))
char <- levels(userlocations$Var1)
char
## [1] "" " Down 2 earth"
## [3] "@scinternacional" "1144 Piedmont Avenue, Atlanta"
## [5] "16" "500 10th Street NE, ATL 30309"
## [7] "6th Layer of the Abyss" "706 — 404 <U+0001F4CC>"
## [9] "Asuncion, Paraguay" "ATL"
## [11] "ATL, GA" "Atlanta"
## [13] "Atlanta Georgia" "Atlanta, GA"
## [15] "Atlanta, GA, USA" "Atlanta, Georgia "
## [17] "Bella Noches" "BornBaptizedRaised inHLP90042 "
## [19] "Chino, CA" "Ciudad Autónoma de Buenos Aire"
## [21] "Coacalco de Berriozábal, Méxic" "Ecatepec de Morelos, México"
## [23] "Everywhere" "Farmington Hills, MI"
## [25] "Georgia" "Georgia, USA"
## [27] "Irvine, CA" "Knoxville, TN"
## [29] "Los Angeles, CA" "Michigan"
## [31] "Nezahualcóyotl, México" "Nowhere"
## [33] "Oakland, CA" "Richmond, VA"
## [35] "rockaway" "Salisbury, NC"
## [37] "Sinaloa, México" "Temascalcingo,Mexico"
## [39] "Texcoco, México" "THE GOON PALACE, NYC"
## [41] "Tultepec, México" "USA, Japan, Taiwan, KSA"
# Hand-written recoding of the raw location strings into broader places.
# The entries must stay in the same order as the levels in `char`, because
# factor() pairs levels and labels by position.
# NOTE(review): these labels match the location strings of one particular
# download; a new download presumably needs a new list - confirm.
label <- c(
  "No Location", "Down 2 earth", "@scinternacional", "Atlanta", "16", "Atlanta",
  "6th Layer of the Abyss", "706 — 404 📌", "Paraguay", "Atlanta", "Atlanta", "Atlanta",
  "Atlanta", "Atlanta", "Atlanta", "Atlanta", "Bella Noches", "BornBaptizedRaised inHLP90042",
  "California", "Argentina", "Mexico", "Mexico", "Everywhere", "Michigan", "Georgia", "Georgia",
  "California", "TN", "California", "Michigan", "Mexico", "Nowhere", "California", "VA",
  "Rockway", "NC", "Mexico", "Mexico", "Mexico", "New York",
  # This entry must be spelled "Mexico": a misspelled label creates a separate
  # group, and its count would then be missing from the Mexico bar.
  "Mexico",
  "KSA"
)
rename <- factor(userlocations$Var1, levels = char, labels = label)
pr.data <- cbind(userlocations, rename)

# Add up the frequencies of all raw strings that share a recoded label.
agg <- aggregate(pr.data[, "Freq"], by = list(pr.data$rename), FUN = sum)

# Bar chart restricted (through xlim) to the recognisable places; all other
# groups are dropped from the plot.
ggplot(data = agg, mapping = aes(x = reorder(Group.1, x), y = x)) +
  geom_col(aes(fill = Group.1), show.legend = FALSE) +
  labs(title = "Frequent Location in Tweets", x = "Location", y = "Frequency") +
  scale_fill_brewer(palette = "Paired") +
  scale_y_continuous(breaks = scales::pretty_breaks()) +
  xlim("Atlanta", "Mexico", "Georgia", "California", "Michigan", "New York", "Paraguay")
## Warning: Removed 17 rows containing missing values (position_stack).
What does gsub do? And what is the syntax of it? gsub(what goes first, what goes second, what goes third)
gsub() replaces every match of a pattern in a string, whereas sub() replaces only the first match. The syntax is gsub(pattern, replacement, x): the first argument is the pattern (the string or regular expression you want to be replaced), the second argument is the replacement string that is put in place of each match, and the third argument is the character vector (for example a data frame column) in which the replacements are made.
Your own keyword
# Open the help page for gsub().
help("gsub")
## starting httpd help server ... done
# Search for the chosen keyword: 100 tweets requested, English only, with
# retweets left out.
my_twts <- search_tweets(
  q = "##lyft",
  n = 100,
  lang = "en",
  include_rts = FALSE
)

# Optional: save the tweets downloaded with rtweet as a CSV file.
# write_as_csv(my_twts, "my_twts.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

# Preview the text of the first three tweets.
head(my_twts[["text"]], n = 3)
## [1] "Been waiting on a #LYFT for 40 mins. Can not wait to get this car man. <U+0001F629>"
## [2] "One thing I hate about #uber is that there is never a damn driver near me when I use it but #Lyft always got a driver 6 mins away"
## [3] "HyreCar is the leading car sharing marketplace where rideshare #drivers can rent #vehicles to drive for companies such as #Uber and #Lyft. They aim to become a major leader in automotive asset sharing.\nhttps://t.co/MapLZROHfv"
# Clean the tweet text before tokenization.
#
# Links: "https?://\\S+" removes only the link itself (up to the next
# whitespace) and covers both http and https. A pattern such as "http.*"
# would also delete everything written after the first link.
my_twts$cleanedTxt <- gsub("https?://\\S+", "", my_twts$text)

# HTML-escaped ampersands: match the literal text "&amp;". The regular
# expression "amp*" means "am" followed by any number of "p", so it would
# also strip "am" out of ordinary words (for example "damn" becomes "dn").
my_twts$cleanedTxt <- gsub("&amp;", "", my_twts$cleanedTxt, fixed = TRUE)

# Preview the cleaned text of the first three tweets.
head(my_twts$cleanedTxt, 3)
## [1] "Been waiting on a #LYFT for 40 mins. Can not wait to get this car man. <U+0001F629>"
## [2] "One thing I hate about #uber is that there is never a dn driver near me when I use it but #Lyft always got a driver 6 mins away"
## [3] "HyreCar is the leading car sharing marketplace where rideshare #drivers can rent #vehicles to drive for companies such as #Uber and #Lyft. They aim to become a major leader in automotive asset sharing.\n"
How many tokens do you get back? What is something we should be cautious of with tokens? (And how to we solve it—we do so at the end of the lab).
We got 2,639 tokens back (this changes every time I knit the code, so please check the result of original.data below). Since words go together and need context, we need to be cautious with tokens because, on their own, they lack context and as such do not provide a lot of information. We solved this problem by using stop words to filter out common words.
What is the problem in your bar chart called unique word-counts found in tweets (Not your Atlanta/Location bar chart), and how did you solve it?
The words displayed on the bar chart are common words which lack context and as such do not provide a lot of information. We solved this problem by using stop words to filter out common words.
How many keywords did you originally have and how many keywords did you have after you applied the stop words?
The original data set had 2,639 keywords (original.data), which reduced to 1,388 keywords (process.data) after applying the stop words. Check the counts below, as they change every time I knit the code.
Tokenization
# Tokenize the cleaned text into one word per row; this removes punctuation
# and converts the words to lowercase.
my_twts_clean <- unnest_tokens(
  dplyr::select(my_twts, cleanedTxt),
  word,
  cleanedTxt
)

# Number of rows (tokens) after tokenization.
original.data <- nrow(my_twts_clean)
original.data
## [1] 2639
#View(my_twts_clean)
# Count every word, keep the 15 most frequent ones and order them by count.
top_words_all <- my_twts_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n))

# Horizontal bar chart of those counts.
ggplot(top_words_all, aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(
    x = "words",
    y = "counts",
    title = "Figure 2: Unique wordcounts found in tweets, with no stop words"
  )
## Selecting by n
#-- Do you observe any problem?
Part 3: Stop Words
# Load the stop-word list from the tidytext package.
data("stop_words")

# Drop every token that appears in the stop-word list.
cleanTokens <- anti_join(my_twts_clean, stop_words)
## Joining, by = "word"

# Number of rows left after removing the stop words; there should be fewer
# words now.
process.data <- nrow(cleanTokens)
process.data
## [1] 1388
# Count the remaining words, keep the 15 most frequent ones and order them by
# count -- notice any issues?
top_words_clean <- cleanTokens %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n))

# Horizontal bar chart of those counts.
ggplot(top_words_clean, aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(
    y = "count",
    x = "words",
    title = "Figure 3: Unique wordcounts found in tweets after applying stop words",
    subtitle = "Stop words removed from the list"
  )
## Selecting by n
Part 4: Wordclouds
### Optional helpers for the word clouds
# library(wordcloud)
library(RColorBrewer)

# Eight colours from the "Dark2" palette.
pal <- brewer.pal(n = 8, name = "Dark2")

# Frequency counts for the 30 most common words (kept for reference, not run):
# freq_df1 <- cleanTokens %>%
#   count(word, sort = TRUE) %>%
#   top_n(30) %>%
#   mutate(word = reorder(word, n))
# wordcloud2(data = freq_df1, minRotation = 0, maxRotation = 0, ellipticity = 0.6)
Which NEW stop words did you add and why?
I added Uber and Lyft as stop words because I wanted to remove the names of these companies so that words such as riders, rideshare, car, and taxi, among others, could stand out in the word cloud. I also added drivers as a stop word because I did not want driver and drivers to appear together in the word cloud.
### Further cleaning: extend the standard stop-word list with custom terms.
my_stopwords <- data.frame(
  word = c(stop_words$word, "uber", "lyft", "drivers")
)

# Remove the standard and the custom stop words from the tokens.
cleanTokens2 <- anti_join(my_twts_clean, my_stopwords)
## Joining, by = "word"
#### Re-run the frequency counts with the extended stop-word list
word_counts2 <- count(cleanTokens2, word, sort = TRUE)
freq_df2 <- word_counts2 %>%
  top_n(50) %>%
  mutate(word = reorder(word, n))
## Selecting by n

# Draw the word cloud from the 50 most frequent words.
wordcloud2(
  data = freq_df2,
  minRotation = 0,
  maxRotation = 0,
  ellipticity = 0.6
)
Part 5: N-grams
# An n-gram is a sequence of n consecutive words; n = 2 gives word pairs.
# install.packages("widyr")
library(widyr)

# Extract the n-grams. You may try other values as well, e.g. n = 3 or n = 4.
my_twts_ngram <- unnest_tokens(
  dplyr::select(my_twts, cleanedTxt),
  paired_words,
  cleanedTxt,
  token = "ngrams",
  n = 2
)

# Show the word pairs sorted from most to least frequent.
count(my_twts_ngram, paired_words, sort = TRUE)
## # A tibble: 2,100 x 2
## paired_words n
## <chr> <int>
## 1 uber lyft 22
## 2 lyft driver 8
## 3 and lyft 7
## 4 i have 7
## 5 lyft uber 7
## 6 uber and 7
## 7 allstate lyft 6
## 8 and other 6
## 9 in a 6
## 10 the driver 6
## # ... with 2,090 more rows
library(tidyr)

# Separate each word pair into two columns.
my_twts_ngram <- my_twts_ngram %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

# Keep only the pairs in which both words are present and neither word is a
# stop word. The is.na() checks are needed because NA is not in the stop-word
# list: NA pairs would pass the %in% filters, be counted, and later appear in
# the word network as a node named "NA".
# NOTE(review): the NA pairs presumably come from tweets whose cleaned text
# has fewer than two words - confirm.
my_twts_filtered <- my_twts_ngram %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# Count the remaining bigrams (n = 2), most frequent first.
my_words_counts <- my_twts_filtered %>%
  count(word1, word2, sort = TRUE)
# head(my_words_counts)

# Word pairs that remain after filtering out the stop words.
head(my_twts_filtered)
## # A tibble: 6 x 2
## word1 word2
## <chr> <chr>
## 1 40 mins
## 2 dn driver
## 3 driver 6
## 4 6 mins
## 5 leading car
## 6 car sharing
# Plot the word network: every word is a node, and an edge links two words
# that occur as a pair at least twice. Edge width is mapped to the pair count.
my_words_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  # edge_alpha is a fixed setting, so it is passed outside aes(); inside aes()
  # the constant would be treated as mapped data and passed through a scale
  # instead of being used as the literal transparency.
  geom_edge_link(aes(edge_width = n), edge_alpha = 0.6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 4) +
  labs(
    title = "Figure 5: Word Network: Tweets using ##lyft",
    subtitle = "Text mining twitter data",
    x = "",
    y = ""
  )
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
What did you learn about your keyword? Please make this an insightful comment. Really dig in and reflect on what you learned.
I used ##lyft as the keyword. The goal was to find tweets related to Lyft as a company. I found that Lyft had the highest unique word count, followed by Uber. This is not surprising, as they are different companies that provide the same services with the same business model. Also, from the n-grams, words like driver, passenger, and injuries are linked.