library(knitr)## Warning: package 'knitr' was built under R version 3.4.4
flights.data = read.csv('https://raw.githubusercontent.com/keshaws/CUNY_MSDS_2020/master/DATA607/fligthts.csv',sep = ',')
kable(flights.data)| Carrier | Status | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
|---|---|---|---|---|---|---|
| Alaska | on_time | 497 | 221 | 212 | 503 | 1841 |
| Alaska | delayed | 62 | 12 | 20 | 102 | 305 |
| AMWEST | on_time | 694 | 4840 | 383 | 320 | 201 |
| AMWEST | delayed | 117 | 415 | 65 | 129 | 61 |
library(tidyr)
library(dplyr)summary(flights.data)## Carrier Status Los.Angeles Phoenix San.Diego
## Alaska:2 delayed:2 Min. : 62.0 Min. : 12.0 Min. : 20.00
## AMWEST:2 on_time:2 1st Qu.:103.2 1st Qu.: 168.8 1st Qu.: 53.75
## Median :307.0 Median : 318.0 Median :138.50
## Mean :342.5 Mean :1372.0 Mean :170.00
## 3rd Qu.:546.2 3rd Qu.:1521.2 3rd Qu.:254.75
## Max. :694.0 Max. :4840.0 Max. :383.00
## San.Francisco Seattle
## Min. :102.0 Min. : 61
## 1st Qu.:122.2 1st Qu.: 166
## Median :224.5 Median : 253
## Mean :263.5 Mean : 602
## 3rd Qu.:365.8 3rd Qu.: 689
## Max. :503.0 Max. :1841
The dataset is untidy, so using gather function to convert the imported data into a ‘tidy’ format
flight <- gather(flights.data, City, Flight_Count, Los.Angeles:Seattle)
kable(flight)| Carrier | Status | City | Flight_Count |
|---|---|---|---|
| Alaska | on_time | Los.Angeles | 497 |
| Alaska | delayed | Los.Angeles | 62 |
| AMWEST | on_time | Los.Angeles | 694 |
| AMWEST | delayed | Los.Angeles | 117 |
| Alaska | on_time | Phoenix | 221 |
| Alaska | delayed | Phoenix | 12 |
| AMWEST | on_time | Phoenix | 4840 |
| AMWEST | delayed | Phoenix | 415 |
| Alaska | on_time | San.Diego | 212 |
| Alaska | delayed | San.Diego | 20 |
| AMWEST | on_time | San.Diego | 383 |
| AMWEST | delayed | San.Diego | 65 |
| Alaska | on_time | San.Francisco | 503 |
| Alaska | delayed | San.Francisco | 102 |
| AMWEST | on_time | San.Francisco | 320 |
| AMWEST | delayed | San.Francisco | 129 |
| Alaska | on_time | Seattle | 1841 |
| Alaska | delayed | Seattle | 305 |
| AMWEST | on_time | Seattle | 201 |
| AMWEST | delayed | Seattle | 61 |
total.flights <- flight %>% group_by(Carrier) %>%
summarize(Total_Flights = sum(Flight_Count))
total.flights## # A tibble: 2 x 2
## Carrier Total_Flights
## <fct> <int>
## 1 Alaska 3775
## 2 AMWEST 7225
flights.ontime <- flight %>% group_by(Carrier) %>%
filter(Status == 'on_time') %>%
summarize(FlightsOnTime = sum(Flight_Count))
flights.ontime## # A tibble: 2 x 2
## Carrier FlightsOnTime
## <fct> <int>
## 1 Alaska 3274
## 2 AMWEST 6438
flights.delayed <- flight %>% group_by(Carrier) %>%
filter(Status == 'delayed') %>%
summarize(FlightsDelayed = sum(Flight_Count))
flights.delayed## # A tibble: 2 x 2
## Carrier FlightsDelayed
## <fct> <int>
## 1 Alaska 501
## 2 AMWEST 787
flights.info = cbind(flights.ontime,FlightsDelayed = flights.delayed$FlightsDelayed, TotalFlights=total.flights$Total_Flights)
flights.info## Carrier FlightsOnTime FlightsDelayed TotalFlights
## 1 Alaska 3274 501 3775
## 2 AMWEST 6438 787 7225
flights.summary <- mutate(flights.info, PercentOnTime = FlightsOnTime/TotalFlights, PercentDelayed= FlightsDelayed/TotalFlights)
flights.summary## Carrier FlightsOnTime FlightsDelayed TotalFlights PercentOnTime
## 1 Alaska 3274 501 3775 0.8672848
## 2 AMWEST 6438 787 7225 0.8910727
## PercentDelayed
## 1 0.1327152
## 2 0.1089273
Carrier.Alaska <- filter(flight, Carrier == 'Alaska')
kable(Carrier.Alaska)| Carrier | Status | City | Flight_Count |
|---|---|---|---|
| Alaska | on_time | Los.Angeles | 497 |
| Alaska | delayed | Los.Angeles | 62 |
| Alaska | on_time | Phoenix | 221 |
| Alaska | delayed | Phoenix | 12 |
| Alaska | on_time | San.Diego | 212 |
| Alaska | delayed | San.Diego | 20 |
| Alaska | on_time | San.Francisco | 503 |
| Alaska | delayed | San.Francisco | 102 |
| Alaska | on_time | Seattle | 1841 |
| Alaska | delayed | Seattle | 305 |
Carrier.AMWEST <- filter(flight, Carrier == 'AMWEST')
kable(Carrier.AMWEST)| Carrier | Status | City | Flight_Count |
|---|---|---|---|
| AMWEST | on_time | Los.Angeles | 694 |
| AMWEST | delayed | Los.Angeles | 117 |
| AMWEST | on_time | Phoenix | 4840 |
| AMWEST | delayed | Phoenix | 415 |
| AMWEST | on_time | San.Diego | 383 |
| AMWEST | delayed | San.Diego | 65 |
| AMWEST | on_time | San.Francisco | 320 |
| AMWEST | delayed | San.Francisco | 129 |
| AMWEST | on_time | Seattle | 201 |
| AMWEST | delayed | Seattle | 61 |
For Alaskan airlines which city had most number of delays:
alaska.delays <- filter(Carrier.Alaska,Status=='delayed')
city.alaska.delays <- alaska.delays %>%
arrange(desc(alaska.delays$Flight_Count))
kable(city.alaska.delays)| Carrier | Status | City | Flight_Count |
|---|---|---|---|
| Alaska | delayed | Seattle | 305 |
| Alaska | delayed | San.Francisco | 102 |
| Alaska | delayed | Los.Angeles | 62 |
| Alaska | delayed | San.Diego | 20 |
| Alaska | delayed | Phoenix | 12 |
alaska.ontime <- filter(Carrier.Alaska,Status=='on_time')
city.alaska.ontime.rank <- alaska.ontime %>%
arrange(desc(alaska.ontime$Flight_Count))
kable(city.alaska.ontime.rank)| Carrier | Status | City | Flight_Count |
|---|---|---|---|
| Alaska | on_time | Seattle | 1841 |
| Alaska | on_time | San.Francisco | 503 |
| Alaska | on_time | Los.Angeles | 497 |
| Alaska | on_time | Phoenix | 221 |
| Alaska | on_time | San.Diego | 212 |
city.amwest.delays <- filter(Carrier.AMWEST, Status == 'delayed')
city.amwest.delays <- city.amwest.delays %>%
arrange(desc(city.amwest.delays$Flight_Count))
kable(city.amwest.delays)| Carrier | Status | City | Flight_Count |
|---|---|---|---|
| AMWEST | delayed | Phoenix | 415 |
| AMWEST | delayed | San.Francisco | 129 |
| AMWEST | delayed | Los.Angeles | 117 |
| AMWEST | delayed | San.Diego | 65 |
| AMWEST | delayed | Seattle | 61 |
city.flight.delays <- filter(flight, Status == 'delayed')
city.flight.delays <- city.flight.delays %>%
arrange(desc(city.flight.delays$Flight_Count))kable(city.flight.delays)| Carrier | Status | City | Flight_Count |
|---|---|---|---|
| AMWEST | delayed | Phoenix | 415 |
| Alaska | delayed | Seattle | 305 |
| AMWEST | delayed | San.Francisco | 129 |
| AMWEST | delayed | Los.Angeles | 117 |
| Alaska | delayed | San.Francisco | 102 |
| AMWEST | delayed | San.Diego | 65 |
| Alaska | delayed | Los.Angeles | 62 |
| AMWEST | delayed | Seattle | 61 |
| Alaska | delayed | San.Diego | 20 |
| Alaska | delayed | Phoenix | 12 |
library(ggplot2)## Warning: package 'ggplot2' was built under R version 3.4.4
ggplot(data = city.flight.delays, aes(x=Carrier,y=Flight_Count))+
#geom_point(alpha = 0.5, size = 5, color ='blue')
geom_bar(stat = 'identity',aes(fill=Carrier))+
geom_text(aes(x = Carrier, y = Flight_Count,
label = paste(Flight_Count),
group = Carrier,
vjust = -0.4)) +
labs(title = "Comparing Delays by Carrier & City",
x = "Carrier",
y = "Delay Flight Count") +
facet_wrap(~City, ncol = 5)+
theme_bw()spread function to view the dataset for delayed flights
flight.delay.spread <- spread(city.flight.delays,City, Flight_Count)kable(flight.delay.spread)| Carrier | Status | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
|---|---|---|---|---|---|---|
| Alaska | delayed | 62 | 12 | 20 | 102 | 305 |
| AMWEST | delayed | 117 | 415 | 65 | 129 | 61 |
flight.df <- flight %>%
spread(Status,'Flight_Count')
kable(flight.df)| Carrier | City | delayed | on_time |
|---|---|---|---|
| Alaska | Los.Angeles | 62 | 497 |
| Alaska | Phoenix | 12 | 221 |
| Alaska | San.Diego | 20 | 212 |
| Alaska | San.Francisco | 102 | 503 |
| Alaska | Seattle | 305 | 1841 |
| AMWEST | Los.Angeles | 117 | 694 |
| AMWEST | Phoenix | 415 | 4840 |
| AMWEST | San.Diego | 65 | 383 |
| AMWEST | San.Francisco | 129 | 320 |
| AMWEST | Seattle | 61 | 201 |
flight_percentile <- mutate(flight.df,delay_percent=round((delayed/(delayed+on_time)*100),2),
onTime_percent=round((on_time/(delayed+on_time))*100,2))
kable(flight_percentile)| Carrier | City | delayed | on_time | delay_percent | onTime_percent |
|---|---|---|---|---|---|
| Alaska | Los.Angeles | 62 | 497 | 11.09 | 88.91 |
| Alaska | Phoenix | 12 | 221 | 5.15 | 94.85 |
| Alaska | San.Diego | 20 | 212 | 8.62 | 91.38 |
| Alaska | San.Francisco | 102 | 503 | 16.86 | 83.14 |
| Alaska | Seattle | 305 | 1841 | 14.21 | 85.79 |
| AMWEST | Los.Angeles | 117 | 694 | 14.43 | 85.57 |
| AMWEST | Phoenix | 415 | 4840 | 7.90 | 92.10 |
| AMWEST | San.Diego | 65 | 383 | 14.51 | 85.49 |
| AMWEST | San.Francisco | 129 | 320 | 28.73 | 71.27 |
| AMWEST | Seattle | 61 | 201 | 23.28 | 76.72 |
library(ggplot2)
ggplot(data = flight_percentile, aes(x=Carrier,y=delay_percent))+
geom_bar(stat = 'identity',aes(fill=Carrier))+
geom_text(aes(x = Carrier, y = delay_percent,
label = paste(delay_percent,'%'),
group = Carrier,
vjust = -0.4)) +
labs(title = "Comparing Delays(%) by Carrier & City",
x = "Carrier",
y = "Delay Percent") +
facet_wrap(~City, ncol = 5)+
theme_bw()library(ggplot2)
ggplot(data = flight_percentile, aes(x=Carrier,y=onTime_percent))+
geom_bar(stat = 'identity',aes(fill=Carrier))+
geom_text(aes(x = Carrier, y = onTime_percent,
label = paste(onTime_percent,'%'),
group = Carrier,
vjust = -0.4)) +
labs(title = "Comparing OnTime(%) by Carrier & City",
x = "Carrier",
y = "OnTime Percent") +
facet_wrap(~City, ncol = 5)+
theme_bw()