library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(tm)
## Loading required package: NLP
# To tokenize
library(tidytext)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud2)
library(stringr)
# ggplot barchart arrangement
library(forcats)
# ggplot colors
library(paletteer)
## Warning: package 'paletteer' was built under R version 4.2.3
library(RColorBrewer)
# Load the raw tweets
airline <- read.csv("Tweets-1.csv")

# Create "hour" variable: parse the creation timestamp, then keep its
# hour-of-day component ("%H") as a number
airline$tweet_created <- parse_datetime(
  airline$tweet_created,
  format = "%m/%d/%Y %H:%M"
)
airline$hour <- as.numeric(format(airline$tweet_created, format = "%H"))

# Tweets that carry a usable negative reason: drop the blank and
# "Can't Tell" categories in a single pass
neg_air <- filter(
  airline,
  negativereason != "",
  negativereason != "Can't Tell"
)
# Plot sentiment counts for all tweets, one panel per airline
ggplot(airline, aes(x = airline_sentiment, fill = airline_sentiment)) +
  geom_bar() +
  facet_wrap(~airline) +
  scale_fill_manual(values = c("#D53E4FFF", "#FEE08BFF", "#ABDDA4FF")) +
  labs(
    title = "Airline Sentiments\n",
    x = "\nSentiments",
    y = "",
    fill = "Sentiments"
  ) +
  # The x-axis text and ticks are hidden; the fill legend identifies the bars
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Plot the frequency of each negative reason (horizontal bars)
ggplot(neg_air, aes(y = negativereason, fill = negativereason)) +
  geom_bar() +
  scale_fill_paletteer_d("RColorBrewer::Spectral") +
  labs(
    title = "Reasons for Negative Airline Tweets\n",
    x = "Frequency",
    y = "",
    fill = "Negative Reason"
  )

# Keep only the tweet text and the airline label (the label to predict)
airline_clean <- airline[, c("text", "airline")]

# Create corpus
text_corpus <- Corpus(VectorSource(airline_clean$text))

# Basic normalisation, applied in this order: plain-text conversion,
# lower-casing, punctuation removal, number removal, whitespace collapsing.
# In the recorded run every tm_map() call on this corpus emitted the warning
# "transformation drops documents".
basic_cleaners <- list(
  PlainTextDocument,
  tolower,
  removePunctuation,
  removeNumbers,
  stripWhitespace
)
for (cleaner in basic_cleaners) {
  text_corpus <- tm_map(text_corpus, cleaner)
}

# Airlines present in the data
unique(airline_clean$airline)
## [1] "Virgin America" "United" "Southwest" "Delta"
## [5] "US Airways" "American"

# Remove the airline handle words, the word "flight" and English stop words.
# NOTE(review): punctuation has already been stripped at this point, so
# contractions such as "don't" are now "dont" and will not match the entries
# of stopwords("english") - confirm whether that is intended.
handle_words <- c(
  "virginamerica", "jetblue", "united",
  "southwestair", "usairways", "americanair"
)
words_to_drop <- c(handle_words, "flight", stopwords("english"))
text_corpus <- tm_map(text_corpus, removeWords, words_to_drop)

# Stem words
text_corpus <- tm_map(text_corpus, stemDocument)

# Document term matrix
freq <- DocumentTermMatrix(text_corpus)

# Drop sparse terms. The threshold used here is 0.995, i.e. only terms that
# appear in roughly 0.5% or more of the tweets are kept.
sparse <- removeSparseTerms(freq, 0.995)

# Convert back to a data frame with syntactically valid column names
tweetsSparse <- as.data.frame(as.matrix(sparse))
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse))

# Attach the airline label as a factor
tweetsSparse$airline <- as.factor(airline_clean$airline)

# Check the share of tweets belonging to each airline
prop.table(table(tweetsSparse$airline))
##
## American Delta Southwest United US Airways
## 0.18845628 0.15177596 0.16530055 0.26106557 0.19897541
## Virgin America
## 0.03442623
# Wordcloud: one cloud per airline, built from that airline's term counts
unique(airline_clean$airline)
## [1] "Virgin America" "United" "Southwest" "Delta"
## [5] "US Airways" "American"

# Total term counts for the tweets of a single airline.
#
# term_counts:  data frame with one numeric column per term plus an
#               `airline` column (here: tweetsSparse).
# airline_name: value of the `airline` column whose rows are summed.
# Returns a data frame with columns `word` and `freq`, sorted by decreasing
# frequency, with the words as row names.
airline_word_freq <- function(term_counts, airline_name) {
  # which() keeps only rows that are definitely this airline (NA labels drop)
  airline_rows <- which(term_counts$airline == airline_name)
  term_cols <- names(term_counts) != "airline"
  term_matrix <- as.matrix(term_counts[airline_rows, term_cols, drop = FALSE])
  totals <- sort(colSums(term_matrix), decreasing = TRUE)
  data.frame(word = names(totals), freq = totals)
}

# Draw a word cloud from a (`word`, `freq`) data frame with the settings
# shared by every airline. Returns the widget so a top-level call prints it.
airline_wordcloud <- function(word_freq) {
  wordcloud2(word_freq, size = 5, minSize = 2, color = brewer.pal(8, "Dark2"))
}

# Virgin America
freq_df_v <- airline_word_freq(tweetsSparse, "Virgin America")
airline_wordcloud(freq_df_v)

# United (kept under its own name so US Airways below does not overwrite it)
freq_df_united <- airline_word_freq(tweetsSparse, "United")
airline_wordcloud(freq_df_united)

# Southwest
freq_df_s <- airline_word_freq(tweetsSparse, "Southwest")
airline_wordcloud(freq_df_s)

# Delta
freq_df_d <- airline_word_freq(tweetsSparse, "Delta")
airline_wordcloud(freq_df_d)

# US Airways
freq_df_u <- airline_word_freq(tweetsSparse, "US Airways")
airline_wordcloud(freq_df_u)

# American
freq_df_a <- airline_word_freq(tweetsSparse, "American")
airline_wordcloud(freq_df_a)
# Hourly analysis

# Tweet counts per hour, stacked by sentiment, one panel per airline
ggplot(airline, aes(x = hour, fill = airline_sentiment)) +
  geom_bar() +
  facet_wrap(~airline) +
  scale_fill_manual(values = c("#D53E4FFF", "#FEE08BFF", "#ABDDA4FF"))

# Negative tweets per hour, stacked by negative reason, one panel per airline
ggplot(neg_air, aes(x = hour, fill = negativereason)) +
  geom_bar() +
  facet_wrap(~airline) +
  labs(
    title = "Reason for Negative Airline Tweets by Hour",
    x = "",
    y = "",
    fill = "Negative Reason"
  ) +
  scale_fill_paletteer_d("RColorBrewer::Spectral")

# Florida locations: the state name anywhere in the text, or "FL" as a
# standalone word, in any letter case. The word boundaries keep places that
# merely start with "fl" (e.g. "Flint") from being counted as Florida.
florida_pattern <- regex("\\bfl\\b|florida", ignore_case = TRUE)

# Create a data frame only including tweets from Florida.
# which() drops rows whose location is NA instead of turning them into NA rows.
florida <- airline[which(str_detect(airline$tweet_location, florida_pattern)), ]

# Plot Florida sentiments, one panel per airline
ggplot(florida, aes(x = airline_sentiment, fill = airline_sentiment)) +
  geom_bar() +
  facet_wrap(~airline) +
  labs(
    title = "Airline Sentiments in Florida\n",
    x = "\nSentiments",
    y = "",
    fill = "Sentiments"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_fill_manual(values = c("#D53E4FFF", "#FEE08BFF", "#ABDDA4FF"))

# Create a data frame that only has Florida as the location and a usable
# negative reason
florida_neg <- neg_air[
  which(str_detect(neg_air$tweet_location, florida_pattern)),
]

# Negative Florida tweets per hour, stacked by reason
ggplot(florida_neg, aes(x = hour, fill = negativereason)) +
  geom_bar() +
  facet_wrap(~airline) +
  scale_fill_paletteer_d("RColorBrewer::Spectral") +
  labs(
    title = "Negative Airline Tweets in Florida\n",
    fill = "Negative Reasons",
    x = "\nHour (in 24hr Format)",
    y = ""
  )

# Negative Florida tweets per reason
ggplot(florida_neg, aes(x = negativereason, fill = negativereason)) +
  geom_bar() +
  facet_wrap(~airline) +
  labs(
    title = "Negative Airline Tweets in Florida\n",
    fill = "Negative Reasons",
    x = "Negative Reasons",
    y = ""
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_fill_paletteer_d("RColorBrewer::Spectral")

# Get mentioned data: for every tweet, the first account it mentions once
# the airline handles themselves have been removed from the text.
airline_handles <- c(
  "@virginamerica", "@jetblue", "@united", "@southwestair",
  "@delta", "@usairways", "@americanair"
)
airline_lower <- tolower(airline$text)
# NOTE(review): this is plain substring removal, so a longer handle that
# starts with one of these (e.g. "@deltaassist") is reduced to its tail and
# no longer counts as a mention - confirm whether that is intended.
for (handle in airline_handles) {
  airline_lower <- gsub(handle, "", airline_lower)
}
airline$text_lower <- airline_lower

# Detect and extract with the same regex engine so the two can never
# disagree: str_extract() yields the first "@word" match, or NA when a tweet
# has none.
first_mention <- str_extract(airline$text_lower, "@\\w+")
has_mention <- !is.na(first_mention)
air_reduced <- airline[has_mention, ]
air_reduced$mention <- first_mention[has_mention]

network <- air_reduced[, c(
  "airline", "mention", "name", "airline_sentiment",
  "user_timezone", "hour", "retweet_count"
)]

# Check the number of each mention.
# plyr::count() is called by its full name: on a vector it returns a data
# frame with columns `x` and `freq`, whereas dplyr::count() expects a data
# frame, so the bare name only works for one particular package load order.
freq <- plyr::count(network$mention)
names(freq) <- c("mention", "freq")
freq <- freq[order(freq$freq, decreasing = TRUE), ]

# Attach each mention's total frequency to its tweets, most mentioned first
network <- merge(freq, network, by = "mention")
network <- network[order(network$freq, decreasing = TRUE), ]

# Select rows that match the top 20 mentioned entities (top_n() keeps ties,
# so more than 20 entities can be selected)
freq_reduc <- top_n(freq, 20, freq)
network_reduc <- network[network$mention %in% freq_reduc$mention, ]
# Top mentions, bars filled by airline. Mentions are ordered by how often
# they occur (fct_infreq, reversed for the y axis); the handle itself is
# drawn as text at x = freq, sized by frequency and coloured by sentiment.
ggplot(
  network_reduc,
  aes(y = fct_rev(fct_infreq(mention)), fill = airline)
) +
  geom_bar(stat = "count") +
  geom_text(
    aes(x = freq, label = mention, size = freq, color = airline_sentiment),
    vjust = 0.2,
    hjust = -0.1
  ) +
  # Extra room on both axes for the text labels
  expand_limits(x = 60, y = c(1, 22)) +
  scale_size(range = c(3, 5)) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  scale_fill_paletteer_d("LaCroixColoR::Lemon") +
  scale_colour_paletteer_d("fishualize::Acanthurus_olivaceus") +
  labs(
    title = "Top 20 Mentions in Airline Tweets\n",
    y = "Mentions\n",
    x = "\nMention Frequency",
    color = "Sentiment",
    fill = "Airline",
    size = "Mention Frequency"
  )

# Same chart with bars filled by sentiment; bar fill and label colour share
# one manual palette, and the labels are slightly transparent.
ggplot(
  network_reduc,
  aes(y = fct_rev(fct_infreq(mention)), fill = airline_sentiment)
) +
  geom_bar(stat = "count") +
  geom_text(
    aes(x = freq, label = mention, size = freq, color = airline_sentiment),
    vjust = 0.2,
    hjust = -0.1,
    alpha = 0.7
  ) +
  expand_limits(x = 60, y = c(1, 22)) +
  scale_size(range = c(3, 5)) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  scale_fill_manual(values = c("#F7AA14FF", "#1BB6AFFF", "#172869FF")) +
  scale_color_manual(values = c("#F7AA14FF", "#1BB6AFFF", "#172869FF")) +
  labs(
    title = "Top 20 Mentions in Airline Tweets\n",
    y = "Mentions\n",
    x = "\nFrequency",
    color = "Sentiment",
    fill = "Sentiment",
    size = "Mention Frequency"
  )
