Import cleaned dataset
df <- read.csv("flight_survey_updated.csv", stringsAsFactors=T)
df$satisfaction <- factor(df$satisfaction)
df$year_first_flight <- factor(df$year_first_flight)
df$num_loyalty_cards <- factor(df$num_loyalty_cards)
Summary of anlayis
- Higher ratings are centered around 40 year old customers
- Older customers (80 and above) and younger customers (20 and below) give lower ratings
- Cheapseats is assumed to be the busiest airline (most flights); they have the most observations in the dataset
- Cool&Young is assumed to be the least busiest airline (least flights); they have the least amount of observations in the dataset
- departure_delay_in_minutes and arrival_delay_in_minutes are heavily right skewed; their outliers should be handled before building the linear model. recommend using the winsor method to set the upper limit to a chose percentile (i.e. 95%)
- most flyers do not have a loyalty card, and most that do have between 1-5.
Revised business questions
- There are 25161 with only 1 loyalty card. Are they more/less satisfied with those with 0 or greater than 1?
- There are 0 with greater than 1 loyalty card. Are they more/less satisfied?
- Who has higher ratings, males or females?
- Why do customers centered around the age of 40 have higher ratings?
- Why do older customers (80 and above) and younger customers (20 and below) give lower ratings?
TODO
- Pivot tables or dplyr summary statistics
Plost histograms using createHist()
library(magrittr)
getBinWidth <- function(v) {
my_min <- min(v)
my_max <- max(v)
l <- length(unique(v))
return((my_max - my_min)/sqrt(l))
}
createIntHist <- function(df,cols,airlineName) {
for(c in cols) {
myVector <- df[[c]] # df must be in local environment
bins_binwidth <- getBinWidth(myVector) # call getBinWidth() to get binwidth argument value
g <- ggplot(df) + # create ggplot instance
geom_histogram(aes(x=myVector), binwidth=bins_binwidth, color="black", fill="white") +
labs(title=airlineName, x=c) +
theme(plot.title = element_text(hjust=0.5))
# from github: https://stackoverflow.com/questions/47000494/how-to-add-mean-and-mode-to-ggplot-histogram
data<- g %+% ggplot_build(g)$data
hist_peak<-data[[1]]%>%filter(y==max(y))%>%.$x
g <- g + geom_vline(aes(xintercept=mean(myVector)),col="black", lwd=1, linetype=2)
g <- g + annotate("text", label=round(hist_peak,1), x=hist_peak, y=0,vjust="bottom",col='orange',size=5)
g <- g + annotate("text", label=round(mean(myVector),1), x=round(mean(myVector),1), y=0,vjust=-1,col='red',size=5)
print(g)
}
}
Create barplots for factor datatypes
createFactorBarPlot <- function(df,cols,airlineName) {
for(col in cols) {
myVector <- df[[col]]
if(col=="airline_name") {
g <- ggplot(df, aes(x=myVector)) +
geom_bar(color="black",fill="white") +
geom_text(stat="count", aes(x=myVector, label=..count..), vjust=0) +
labs(title=col, x=col) +
theme(axis.text.x = element_text(angle=90, vjust=0))
print(g)
} else {
g <- ggplot(df, aes(x=myVector)) +
geom_bar(color="black",fill="white") +
geom_text(stat="count", aes(x=as.numeric(myVector), label=..count..), vjust=0) +
labs(title=airlineName, x=col)
print(g)
}
}
#return(data)
}
create barplots for discrete variables
createFactorBarPlot(df,c("satisfaction","gender","type_of_travel","class","airline_name","flight_cancelled","year_first_flight","num_loyalty_cards","airline_code"),"All Airlines")









Analyze satisfaction by age
#df$satisfaction <- factor(df$satisfaction, levels=c(1,2,3,4,5), ordered = T)
g <- ggplot(df, aes(x=age,y=factor(satisfaction))) +
geom_col(aes(fill=satisfaction))
g

- looks like the distribution for satisfied customers is centered around 40 year olds
g <- ggplot(df, aes(x=type_of_travel)) +
geom_bar(aes(fill=gender)) +
geom_text(stat="count", aes(x=type_of_travel, label=..count..), vjust=0)
g

- Males and females fly almost equally for Business and Mileage trips, but females fly more for Personal travel
creat histogram of satisfaction by airline
createSatHist <- function(airlineDf, airlineName) {
airlineDf$satisfaction <- airlineDf$satisfaction %>% as.numeric()
g <- ggplot(airlineDf) + # create ggplot instance
geom_bar(aes(x=satisfaction), color="black", fill="white") +
geom_text(stat="count", aes(x=satisfaction, label=..count..), vjust=0) +
scale_x_discrete(limits=c(1,2,3,4,5)) +
labs(title=airlineName, x=airlineName, y="satisfaction") +
theme(plot.title = element_text(hjust=0.5))
data <- g %+% ggplot_build(g)$data
hist_peak<-data[[1]]%>%filter(y==max(y))%>%.$x
g <- g + geom_vline(aes(xintercept=mean(airlineDf$satisfaction)),col="black", lwd=1, linetype=2)
g <- g + annotate("text", label=round(hist_peak,1), x=hist_peak,y=0,vjust="bottom",col='orange',size=5)
g <- g + annotate("text", label=round(mean(airlineDf$satisfaction),1), x=round(mean(airlineDf$satisfaction),1), y=0,vjust=-1,col='red',size=5)
# get the ratio of 4&5 ratings/all ratings
good_ratings_ratio <- round(length(which(airlineDf$satisfaction >= 4))/nrow(airlineDf),4)
#cat(sprintf("%s: %.4f\n", airlineName, good_ratings_ratio))
g <- g + annotate("text", label="High ratings ratio:", x=min(airlineDf$satisfaction), y=length(which(airlineDf$satisfaction==4)))
g <- g + annotate("text", label=good_ratings_ratio, x=min(airlineDf$satisfaction)+1, y=length(which(airlineDf$satisfaction==4)))
print(g)
}
get satisfaction barplot for all airlines
createSatHist(df, "All Airlines")

create histogram for all airline ratings
- The dashed line with red number is the mean satisfaction rating and the orange number is the mode (most frequent rating)
airlineNames <- c('Cheapseats', 'Cool&Young', 'EnjoyFlying', 'FlyFast', 'FlyHere', 'FlyToSun', 'GoingNorth', 'Northwest', 'OnlyJets', 'Oursin', 'PaulSmith', 'Sigma', 'Southeast', 'West')
for(airline in airlineNames) {
airlineDf <- df[which(df$airline_name==airline),]
airlineDf$satisfaction <- as.numeric(airlineDf$satisfaction)
createSatHist(airlineDf, airline)
}














get male/female customer count for CheapAirlines
cheap_male_female <- df %>% dplyr::select(airline_name,gender) %>% dplyr::filter(airline_name=="Cheapseats") %>% dplyr::group_by(gender) %>% dplyr::summarise(count=n())
g <- ggplot(cheap_male_female, aes(x=gender,y=count)) +
geom_bar(stat="identity", color="black",fill="white") +
labs(title="Cheapseats gender distribution", x="Gender")
g

- The dashed line with red number is the mean satisfaction rating and the orange number is the mode (most frequent rating)
- GoingNorth and OnlyJets have the lowest high ratings ratio at 48% and 49% respectively
- West and Cool&Young have the highest high ratings ratio at 57% and 55% respectively, but they are are the smallest airlines
Business quesitons
- Who has higher ratings, males or females?
- Why do customers centered around the age of 40 have higher ratings?
- Why do older customers (80 and above) and younger customers (20 and below) give lower ratings?
how many female, blue, economy, personal travelers are there?
df %>% dplyr::select(satisfaction,airline_name,gender,airline_status,type_of_travel) %>%
dplyr::filter(gender=="Female",airline_name=="Cheapseats",airline_status=="Blue",type_of_travel=="Personal") %>%
dplyr::group_by(satisfaction) %>%
dplyr::summarise(count=n()) %>%
dplyr::arrange(satisfaction)
## # A tibble: 5 x 2
## satisfaction count
## <fct> <int>
## 1 1 256
## 2 2 2434
## 3 3 1385
## 4 4 83
## 5 5 4
how many male, blue, economy, personal travelers are there?
df %>% dplyr::select(satisfaction,airline_name,gender,airline_status,type_of_travel) %>%
dplyr::filter(gender=="Male",airline_name=="Cheapseats",airline_status=="Blue",type_of_travel=="Personal") %>%
dplyr::group_by(satisfaction) %>%
dplyr::summarise(count=n()) %>%
dplyr::arrange(satisfaction)
## # A tibble: 5 x 2
## satisfaction count
## <fct> <int>
## 1 1 87
## 2 2 1201
## 3 3 667
## 4 4 101
## 5 5 10
how many blue, cheapseats, personal travelers are there in total
df %>% dplyr::select(satisfaction,airline_name,gender,airline_status,type_of_travel) %>%
dplyr::filter(airline_name=="Cheapseats",airline_status=="Blue",type_of_travel=="Personal") %>%
dplyr::group_by(satisfaction) %>%
dplyr::summarise(count=n()) %>%
dplyr::arrange(satisfaction)
## # A tibble: 5 x 2
## satisfaction count
## <fct> <int>
## 1 1 343
## 2 2 3635
## 3 3 2052
## 4 4 184
## 5 5 14
run summary statistics for Cheapseats
cheapseatsDf <- df[which(df$airline_name=="Cheapseats"),]
createFactorBarPlot(cheapseatsDf,c("airline_status","satisfaction","gender","type_of_travel","class","flight_cancelled","year_first_flight","num_loyalty_cards","airline_code"),"Cheapseats")









get blue status by gender
g <- ggplot(cheapseatsDf, aes(x=airline_status)) +
geom_bar(aes(fill=gender)) +
geom_text(stat="count", aes(x=airline_status, label=..count..), vjust=0) +
labs(title="Cheapseats status/gender")
g

get loyalty cards by gender
g <- ggplot(cheapseatsDf, aes(x=num_loyalty_cards)) +
geom_bar(aes(fill=gender)) +
geom_text(stat="count", aes(x=num_loyalty_cards, label=..count..), vjust=0) +
labs(title="Cheapseats loyalty/gender")
g

cheapseats satisfaction/gender
g <- ggplot(cheapseatsDf, aes(x=satisfaction)) +
geom_bar(aes(fill=gender)) +
geom_text(stat="count", aes(x=satisfaction, label=..count..), vjust=0) +
labs(title="Cheapseats satisfaction/gender")
g

cheapseats satisfaction/status
g <- ggplot(cheapseatsDf, aes(x=satisfaction)) +
geom_bar(aes(fill=airline_status)) +
geom_text(stat="count", aes(x=satisfaction, label=..count..), vjust=0) +
theme_bw() +
scale_fill_manual("airline_status", values = c("Blue" = "dodgerblue", "Gold" = "gold", "Platinum" = "slategrey", "Silver" = "grey89")) +
labs(title="Cheapseats satisfaction/status")
g

createIntHist(cheapseatsDf,c("num_flights","percent_flight_other_airlines","departure_delay_in_minutes","arrival_delay_in_minutes","flight_time_in_minutes","flight_distance","airport_shopping","airport_dining"),"Cheapseats")








Winsorize departure_delay_in_minutes and arrival_delay_in_minutes
- see how many rows fall above the 95th percentile
departure_delay_95 <- quantile(df$departure_delay_in_minutes, .95)
sprintf("%d rows will be changed to the 95th percentile of departure_delay_in_minutes", length(which(df$departure_delay_in_minutes > departure_delay_95)))
## [1] "6419 rows will be changed to the 95th percentile of departure_delay_in_minutes"
df[which(df$departure_delay_in_minutes > departure_delay_95), "departure_delay_in_minutes"] = departure_delay_95
arrival_delay_95 <- quantile(df$arrival_delay_in_minutes, .95)
sprintf("%d rows will be changed to the 95th percentile of arrival_delay_in_minutes", length(which(df$arrival_delay_in_minutes > arrival_delay_95)))
## [1] "6450 rows will be changed to the 95th percentile of arrival_delay_in_minutes"
df[which(df$arrival_delay_in_minutes > arrival_delay_95),"arrival_delay_in_minutes"] = arrival_delay_95