library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(tm)
## Loading required package: NLP
# To tokenize
library(tidytext)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud2)
library(stringr)
# ggplot barchart arrangement
library(forcats)
# ggplot colors 
library(paletteer)
## Warning: package 'paletteer' was built under R version 4.2.3
library(RColorBrewer)

# Load the raw tweets (the CSV is read relative to the working directory).
airline <- read.csv("Tweets-1.csv")

# Create "hour" variable: parse the creation timestamp
# (month/day/year hour:minute) and keep only the hour as a number.
airline$tweet_created <- parse_datetime(airline$tweet_created, format = "%m/%d/%Y %H:%M")
airline$hour <- as.numeric(format(airline$tweet_created, format = "%H"))

# Tweets with a usable negative reason: drop rows whose reason is blank or
# "Can't Tell". Both conditions are applied in a single filter call (the
# conditions are AND-ed), replacing the previously duplicated blank filter.
neg_air <- dplyr::filter(
  airline,
  negativereason != "",
  negativereason != "Can't Tell"
)

# Sentiment counts across all tweets, one facet per airline.
# The x-axis text and ticks are hidden because the fill legend already
# identifies each sentiment.
ggplot(airline, aes(x = airline_sentiment, fill = airline_sentiment)) +
  geom_bar() +
  facet_wrap(~airline) +
  scale_fill_manual(values = c("#D53E4FFF", "#FEE08BFF", "#ABDDA4FF")) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(
    title = "Airline Sentiments\n",
    x = "\nSentiments",
    y = "",
    fill = "Sentiments"
  )

# Frequency of each negative reason (blank and "Can't Tell" already removed
# from neg_air), drawn as horizontal bars.
ggplot(neg_air, aes(y = negativereason, fill = negativereason)) +
  geom_bar() +
  labs(
    title = "Reasons for Negative Airline Tweets\n",
    x = "Frequency",
    y = "",
    fill = "Negative Reason"
  ) +
  scale_fill_paletteer_d("RColorBrewer::Spectral")

# Keep only the tweet text and the airline label; the term data frame built
# below gets the airline attached as a factor column.
airline_clean <- airline[, c("text", "airline")]

# Build the corpus and normalise the text: lower-case, strip punctuation,
# digits and redundant whitespace.
# Note: in the recorded run every tm_map() call on this corpus emitted a
# "transformation drops documents" warning.
text_corpus <- Corpus(VectorSource(airline_clean$text)) %>%
  tm_map(PlainTextDocument) %>%
  tm_map(tolower) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

# Airlines present in the data. Recorded output:
#   "Virgin America" "United" "Southwest" "Delta" "US Airways" "American"
unique(airline_clean$airline)

# Remove stop words: airline account names (punctuation is already stripped,
# so they appear without "@"), the word "flight", and the standard English
# stop-word list. Then stem the remaining words.
words_to_drop <- c(
  "virginamerica", "jetblue", "united", "southwestair", "usairways",
  "americanair", "flight", stopwords("english")
)
text_corpus <- text_corpus %>%
  tm_map(removeWords, words_to_drop) %>%
  tm_map(stemDocument)

# Document-term matrix: one row per tweet, one column per stemmed term.
freq <- DocumentTermMatrix(text_corpus)

# Drop very sparse terms. A threshold of 0.995 keeps only terms that appear
# in roughly 0.5% or more of the tweets.
sparse <- removeSparseTerms(freq, 0.995)

# Back to a plain data frame with syntactically valid column names.
tweetsSparse <- as.data.frame(as.matrix(sparse))
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse))

# Attach the airline label as a factor.
tweetsSparse$airline <- as.factor(airline_clean$airline)

# Share of tweets per airline. In the recorded run United was the largest
# group (~26%) and Virgin America the smallest (~3%).
prop.table(table(tweetsSparse$airline))
# Wordcloud

# Term frequencies for a single airline.
#
# term_df:      data frame with one numeric column per term plus an
#               `airline` column (here: tweetsSparse).
# airline_name: value of `airline` to select.
#
# Returns a data frame with columns `word` and `freq`, sorted by decreasing
# frequency, which is the input shape wordcloud2() is given below.
airline_word_freq <- function(term_df, airline_name) {
  is_airline <- term_df$airline == airline_name
  is_term <- !names(term_df) %in% c("airline")
  # drop = FALSE keeps a data frame even if only one term column remains.
  term_matrix <- as.matrix(term_df[is_airline, is_term, drop = FALSE])
  term_counts <- sort(colSums(term_matrix), decreasing = TRUE)
  data.frame(word = names(term_counts), freq = term_counts)
}

# Airlines present in the data. Recorded output:
#   "Virgin America" "United" "Southwest" "Delta" "US Airways" "American"
unique(airline_clean$airline)

# One frequency table per airline. US Airways gets its own name (freq_df_us)
# so it no longer overwrites United's table (freq_df_u).
freq_df_v <- airline_word_freq(tweetsSparse, "Virgin America")
freq_df_u <- airline_word_freq(tweetsSparse, "United")
freq_df_s <- airline_word_freq(tweetsSparse, "Southwest")
freq_df_d <- airline_word_freq(tweetsSparse, "Delta")
freq_df_us <- airline_word_freq(tweetsSparse, "US Airways")
freq_df_a <- airline_word_freq(tweetsSparse, "American")

# The clouds are kept as separate top-level calls so each one is auto-printed
# when the script is run or knitted.

# Virgin America
wordcloud2(freq_df_v, size = 5, minSize = 2, color = brewer.pal(8, "Dark2"))
# United
wordcloud2(freq_df_u, size = 5, minSize = 2, color = brewer.pal(8, "Dark2"))
# Southwest
wordcloud2(freq_df_s, size = 5, minSize = 2, color = brewer.pal(8, "Dark2"))
# Delta
wordcloud2(freq_df_d, size = 5, minSize = 2, color = brewer.pal(8, "Dark2"))
# US Airways
wordcloud2(freq_df_us, size = 5, minSize = 2, color = brewer.pal(8, "Dark2"))
# American
wordcloud2(freq_df_a, size = 5, minSize = 2, color = brewer.pal(8, "Dark2"))
# Start Hourly Analysis

# Tweet counts per hour of day, stacked by sentiment, one facet per airline.
ggplot(airline) +
  geom_bar(aes(x = hour, fill = airline_sentiment)) +
  scale_fill_manual(values = c("#D53E4FFF", "#FEE08BFF", "#ABDDA4FF")) +
  facet_wrap(~airline)

# Negative tweets per hour of day, stacked by negative reason, one facet per
# airline.
ggplot(neg_air, aes(x = hour, fill = negativereason)) +
  geom_bar() +
  facet_wrap(~airline) +
  labs(
    title = "Reason for Negative Airline Tweets by Hour",
    x = "",
    y = "",
    fill = "Negative Reason"
  ) +
  scale_fill_paletteer_d("RColorBrewer::Spectral")

# Location filter for Florida: the word "florida" or the standalone token
# "fl", case-insensitive. The word boundaries stop words that merely start
# with "fl" (e.g. " Flagstaff") from matching, and let "Miami,FL" match.
florida_pattern <- regex("\\bfl\\b|florida", ignore_case = TRUE)

# Create a data frame only including tweets from Florida.
# which() drops rows whose location is NA instead of producing all-NA rows.
florida <- airline[which(str_detect(airline$tweet_location, florida_pattern)), ]

# Plot Florida sentiments, one facet per airline. Axis and legend are labelled
# "Sentiments" because the plotted variable is airline_sentiment.
ggplot(florida, aes(x = airline_sentiment, fill = airline_sentiment)) +
  geom_bar() +
  facet_wrap(~airline) +
  labs(title = "Airline Sentiments in Florida\n", x = "\nSentiments", y = "", fill = "Sentiments") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_fill_manual(values = c("#D53E4FFF", "#FEE08BFF", "#ABDDA4FF"))

# Create a data frame that only has Florida as the location and a usable
# negative reason.
florida_neg <- neg_air[which(str_detect(neg_air$tweet_location, florida_pattern)), ]

# Negative Florida tweets per hour of day, stacked by reason.
ggplot(florida_neg, aes(x = hour, fill = negativereason)) +
  geom_bar() +
  facet_wrap(~airline) +
  scale_fill_paletteer_d("RColorBrewer::Spectral") +
  labs(title = "Negative Airline Tweets in Florida\n", fill = "Negative Reasons", x = "\nHour (in 24hr Format)", y = "")

# Negative Florida tweets per reason; the x-axis text is hidden because the
# fill legend already names each reason.
ggplot(florida_neg, aes(x = negativereason, fill = negativereason)) +
  geom_bar() +
  facet_wrap(~airline) +
  labs(title = "Negative Airline Tweets in Florida\n", fill = "Negative Reasons", x = "Negative Reasons", y = "") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_fill_paletteer_d("RColorBrewer::Spectral")

# Get mentioned data

# Lower-case the tweet text and strip the listed airline handles, so that the
# first "@..." token left in a tweet is a handle other than those.
airline_handles <- c(
  "@virginamerica", "@jetblue", "@united", "@southwestair",
  "@delta", "@usairways", "@americanair"
)
airline_lower <- tolower(airline$text)
for (handle in airline_handles) {
  # The handles are literal strings, so match them as fixed text.
  airline_lower <- gsub(handle, "", airline_lower, fixed = TRUE)
}
airline$text_lower <- airline_lower

# First remaining mention per tweet (NA when there is none). Detection and
# extraction use the same stringr call so the two can never disagree on
# which tweets contain a mention.
first_mention <- str_extract(airline$text_lower, "@\\w+")
has_mention <- !is.na(first_mention)

air_reduced <- airline[has_mention, ]
air_reduced$mention <- first_mention[has_mention]

network <- air_reduced[, c("airline", "mention", "name", "airline_sentiment", "user_timezone", "hour", "retweet_count")]

# Check the number of each mention.
# plyr::count() is called explicitly: it accepts a plain vector and returns
# columns `x` and `freq`; dplyr::count() has a different interface.
# Note: `freq` is reused here and replaces the document-term matrix of the
# same name created earlier in the script.
freq <- plyr::count(network$mention)
freq <- freq[order(freq$freq, decreasing = TRUE), ]
names(freq) <- c("mention", "freq")

# Attach each mention's total frequency to the tweet-level rows.
network <- merge(freq, network, by = "mention")
network <- network[order(network$freq, decreasing = TRUE), ]

# Select rows that match the top 20 mentioned entities.
freq_reduc <- top_n(freq, 20, freq)
network_reduc <- network[network$mention %in% freq_reduc$mention, ]


# Layers shared by both top-mention charts: extra room on both axes so the
# text labels fit, a bounded label size range, and a hidden y-axis (each bar
# is labelled with its mention by geom_text instead).
mention_layers <- list(
  expand_limits(x = 60, y = c(-1, 20) + 2),
  scale_size(range = c(3, 5)),
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
)

# Top mentions, bars filled by airline, labels coloured by tweet sentiment.
ggplot(network_reduc, aes(y = fct_rev(fct_infreq(mention)), fill = airline)) +
  geom_bar(stat = "count") +
  geom_text(
    aes(x = freq, label = mention, size = freq, color = airline_sentiment),
    vjust = 0.2,
    hjust = -0.1
  ) +
  mention_layers +
  scale_fill_paletteer_d("LaCroixColoR::Lemon") +
  scale_colour_paletteer_d("fishualize::Acanthurus_olivaceus") +
  labs(
    title = "Top 20 Mentions in Airline Tweets\n",
    y = "Mentions\n",
    x = "\nMention Frequency",
    color = "Sentiment",
    fill = "Airline",
    size = "Mention Frequency"
  )

# Top mentions, bars and labels both coloured by tweet sentiment.
ggplot(network_reduc, aes(y = fct_rev(fct_infreq(mention)), fill = airline_sentiment)) +
  geom_bar(stat = "count") +
  geom_text(
    aes(x = freq, label = mention, size = freq, color = airline_sentiment),
    vjust = 0.2,
    hjust = -0.1,
    alpha = 0.7
  ) +
  mention_layers +
  scale_fill_manual(values = c("#F7AA14FF", "#1BB6AFFF", "#172869FF")) +
  scale_color_manual(values = c("#F7AA14FF", "#1BB6AFFF", "#172869FF")) +
  labs(
    title = "Top 20 Mentions in Airline Tweets\n",
    y = "Mentions\n",
    x = "\nFrequency",
    color = "Sentiment",
    fill = "Sentiment",
    size = "Mention Frequency"
  )