ABC is a new airline company that operates in Europe. ABC wishes to expand their operations to US market. They have conduncted a market research and collected some surveys to find the current pain points of travellers in the US market. ABC would like to expierment with a sample of tweets they collected and test multiple hypothesis first. Later on, a larger dataset would be collected to help build their new market entering strategy. After this expermient, ABC would have a clear idea on current players in the market and what they lack.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library("wordcloud")
## Loading required package: RColorBrewer
library(reshape2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
setwd("~/Desktop/IE Course Work/3rd Semester/Innovation & Creating Processes")
tweets = read.csv('./Tweets.csv')
names(tweets)
## [1] "tweet_id" "airline_sentiment"
## [3] "airline_sentiment_confidence" "negativereason"
## [5] "negativereason_confidence" "airline"
## [7] "airline_sentiment_gold" "name"
## [9] "negativereason_gold" "retweet_count"
## [11] "text" "tweet_coord"
## [13] "tweet_created" "tweet_location"
## [15] "user_timezone"
summary(tweets)
## tweet_id airline_sentiment airline_sentiment_confidence
## Min. :5.676e+17 negative:9178 Min. :0.3350
## 1st Qu.:5.686e+17 neutral :3099 1st Qu.:0.6923
## Median :5.695e+17 positive:2363 Median :1.0000
## Mean :5.692e+17 Mean :0.9002
## 3rd Qu.:5.699e+17 3rd Qu.:1.0000
## Max. :5.703e+17 Max. :1.0000
##
## negativereason negativereason_confidence
## :5462 Min. :0.000
## Customer Service Issue:2910 1st Qu.:0.361
## Late Flight :1665 Median :0.671
## Can't Tell :1190 Mean :0.638
## Cancelled Flight : 847 3rd Qu.:1.000
## Lost Luggage : 724 Max. :1.000
## (Other) :1842 NA's :4118
## airline airline_sentiment_gold name
## American :2759 :14600 JetBlueNews: 63
## Delta :2222 negative: 32 kbosspotter: 32
## Southwest :2420 neutral : 3 _mhertz : 29
## United :3822 positive: 5 otisday : 28
## US Airways :2913 throthra : 27
## Virgin America: 504 rossj987 : 23
## (Other) :14438
## negativereason_gold retweet_count
## :14608 Min. : 0.00000
## Customer Service Issue : 12 1st Qu.: 0.00000
## Late Flight : 4 Median : 0.00000
## Can't Tell : 3 Mean : 0.08265
## Cancelled Flight : 3 3rd Qu.: 0.00000
## Cancelled Flight\nCustomer Service Issue: 2 Max. :44.00000
## (Other) : 8
## text tweet_coord
## @united thanks : 6 :13621
## @AmericanAir thanks : 5 [0.0, 0.0] : 164
## @JetBlue thanks! : 5 [40.64656067, -73.78334045]: 6
## @SouthwestAir sent : 5 [32.91792297, -97.00367737]: 3
## @AmericanAir thank you!: 4 [40.64646912, -73.79133606]: 3
## @united thank you! : 4 [18.22245647, -63.00369733]: 2
## (Other) :14611 (Other) : 841
## tweet_created tweet_location
## 2015-02-24 09:54:34 -0800: 5 :4733
## 2015-02-24 11:43:05 -0800: 4 Boston, MA : 157
## 2015-02-23 06:57:24 -0800: 3 New York, NY : 156
## 2015-02-23 10:58:58 -0800: 3 Washington, DC: 150
## 2015-02-23 14:18:58 -0800: 3 New York : 127
## 2015-02-23 15:25:46 -0800: 3 USA : 126
## (Other) :14619 (Other) :9191
## user_timezone
## :4820
## Eastern Time (US & Canada):3744
## Central Time (US & Canada):1931
## Pacific Time (US & Canada):1208
## Quito : 738
## Atlantic Time (Canada) : 497
## (Other) :1702
tweets$date <- as.Date(tweets$tweet_created)
posNeg = tweets %>% group_by(airline_sentiment) %>% dplyr::summarise(count = n())
posNeg
## Source: local data frame [3 x 2]
##
## airline_sentiment count
## (fctr) (int)
## 1 negative 9178
## 2 neutral 3099
## 3 positive 2363
ggplot(data=posNeg , aes(x="" , y=count , fill=airline_sentiment))+geom_bar(width=1,stat="identity")+
geom_text(aes(y = count/3 + c(0, cumsum(count)[-length(count)]),
label =count), size=4)+coord_polar("y") + scale_fill_manual(values=c("#ffbebe", "#fff9be", "#d4ffbe"))
We notice that more than half of tweets collected are negative. Lets dig deeper and see if negative percentage are more concentrated in certain airlines.
ggplot(tweets, aes(tweets$airline, fill=airline_sentiment)) + geom_bar()
airlines= tweets %>% group_by(airline) %>% dplyr::summarise(count=n())
posNegByAirline <-dcast(tweets, airline ~ airline_sentiment)
## Using date as value column: use value.var to override.
## Aggregation function missing: defaulting to length
posNegByAirline$negPer = posNegByAirline$negative / (posNegByAirline$negative + posNegByAirline$positive + posNegByAirline$neutral)
posNegByAirline = posNegByAirline[order(-posNegByAirline$negPer),]
posNegByAirline
## airline negative neutral positive negPer
## 5 US Airways 2263 381 269 0.7768623
## 1 American 1960 463 336 0.7104023
## 4 United 2633 697 492 0.6889063
## 3 Southwest 1186 664 570 0.4900826
## 2 Delta 955 723 544 0.4297930
## 6 Virgin America 181 171 152 0.3591270
We notice that for US Airways, American and United that percentage of negative tweets are more than 50% of the total tweets.
negativeTweets <- tweets %>% filter(airline_sentiment=="negative")
negativeTweetsByDate <- negativeTweets %>% group_by(date) %>% dplyr::summarise(count = n())
negativeTweetsByDatePlot = ggplot() + geom_line(data=negativeTweetsByDate, aes(x=date, y=count, group = 1))
negativeTweetsByDatePlot
February 22nd was the peak for negative tweets. To conclude whether that day was upnormal we should have a look at the distruption of negative tweets by airlines. to see if that it was a phonenma across many airlines.
negativeTweetsByDateByAirline <- negativeTweets %>% group_by(airline,date) %>% dplyr::summarise(count = n())
negativeTweetsByDateByAirlinePlot = ggplot() + geom_line(data=negativeTweetsByDateByAirline, aes(x=date, y=count, group =airline , color=airline))
negativeTweetsByDateByAirlinePlot
We notice a general increase in negative tweets in 22nd Feb. However, the sudden spike of negative tweets from almost zero to 700 for american airlines looks suspicious. It might be a data quality issue and it might be normal. This needs further investegation.
negativeReasonTweets <- negativeTweets %>% group_by(negativereason) %>% dplyr::summarise(count=n()) %>% arrange(desc(count))
negativeReasonTweets
## Source: local data frame [10 x 2]
##
## negativereason count
## (fctr) (int)
## 1 Customer Service Issue 2910
## 2 Late Flight 1665
## 3 Can't Tell 1190
## 4 Cancelled Flight 847
## 5 Lost Luggage 724
## 6 Bad Flight 580
## 7 Flight Booking Problems 529
## 8 Flight Attendant Complaints 481
## 9 longlines 178
## 10 Damaged Luggage 74
We notice that top two reasons for negative tweets are Customer Service issues and late flights. But this might be different for each airline.
negativeReasonTweetsByAirline <- negativeTweets %>% group_by(airline,negativereason) %>% dplyr::summarise(count=n())%>% arrange(airline,desc(count))
negativeReasonTweetsByAirline
## Source: local data frame [60 x 3]
## Groups: airline [6]
##
## airline negativereason count
## (fctr) (fctr) (int)
## 1 American Customer Service Issue 768
## 2 American Late Flight 249
## 3 American Cancelled Flight 246
## 4 American Can't Tell 198
## 5 American Lost Luggage 149
## 6 American Flight Booking Problems 130
## 7 American Bad Flight 87
## 8 American Flight Attendant Complaints 87
## 9 American longlines 34
## 10 American Damaged Luggage 12
## .. ... ... ...
and heres the top reason for each airline. We notice that “Customer Service Issue” is the main reason for negative tweets except for delta where its late flights.
topReasonPerAirline <- ddply(negativeReasonTweetsByAirline, "airline", function(z) head(z,1))
topReasonPerAirline
## airline negativereason count
## 1 American Customer Service Issue 768
## 2 Delta Late Flight 269
## 3 Southwest Customer Service Issue 391
## 4 United Customer Service Issue 681
## 5 US Airways Customer Service Issue 811
## 6 Virgin America Customer Service Issue 60
docs <- Corpus(VectorSource(tweets$text))
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeWords, c("usairways" ,"united", "flight" , "americanair" , "jetblue" , "southwestair"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## get get 1338
## can can 1108
## thanks thanks 1073
## cancelled cancelled 1056
## now now 1024
## just just 966
## service service 956
## help help 858
## time time 784
## will will 763
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
docs2 <- Corpus(VectorSource(negativeTweets$text))
docs2 <- tm_map(docs2, content_transformer(tolower))
docs2 <- tm_map(docs2, removeNumbers)
docs2 <- tm_map(docs2, removeWords, stopwords("english"))
docs2 <- tm_map(docs2, removeWords, c("usairways" ,"united", "flight" , "americanair" , "jetblue" , "southwestair"))
docs2 <- tm_map(docs2, removePunctuation)
docs2 <- tm_map(docs2, stripWhitespace)
dtm2 <- TermDocumentMatrix(docs2)
m2 <- as.matrix(dtm2)
v2 <- sort(rowSums(m2),decreasing=TRUE)
d2 <- data.frame(word = names(v2),freq=v2)
head(d2, 10)
## word freq
## get get 986
## cancelled cancelled 920
## now now 824
## service service 741
## hours hours 656
## can can 624
## just just 616
## help help 613
## hold hold 612
## customer customer 605
wordcloud(words = d2$word, freq = d2$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
For a full EDA and to fullfil our goal which is the entering market startegy we need the following, which can’t be obtained from twitter data unless we do text mining on the tweets and we try to extract those features from the tweets. It should be useful to see if the customer care account at the airlines responded to those tweet asking for more information like flight Number, time, date ..etc.
Missing Data (Areas of improvements): 1- Directions of the flight: Customer complaints might be related to certain airports. 2- Price& Category of airlines: 3- type of traveller: Any information that can be obtained about the twitter profile.