flights.data = read.csv('https://raw.githubusercontent.com/niteen11/MSDS/master/DATA607/Week5/dataset/fligthts.csv',sep = ',')
flights.data
## Carrier Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on_time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AMWEST on_time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
Load required libraries
library(knitr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summary(flights.data)
## Carrier Status Los.Angeles Phoenix San.Diego
## Alaska:2 delayed:2 Min. : 62.0 Min. : 12.0 Min. : 20.00
## AMWEST:2 on_time:2 1st Qu.:103.2 1st Qu.: 168.8 1st Qu.: 53.75
## Median :307.0 Median : 318.0 Median :138.50
## Mean :342.5 Mean :1372.0 Mean :170.00
## 3rd Qu.:546.2 3rd Qu.:1521.2 3rd Qu.:254.75
## Max. :694.0 Max. :4840.0 Max. :383.00
## San.Francisco Seattle
## Min. :102.0 Min. : 61
## 1st Qu.:122.2 1st Qu.: 166
## Median :224.5 Median : 253
## Mean :263.5 Mean : 602
## 3rd Qu.:365.8 3rd Qu.: 689
## Max. :503.0 Max. :1841
The loaded dataset is untidy, so let’s format into a ‘tidy’ format
flight <- gather(flights.data, City, Flight_Count, Los.Angeles:Seattle)
flight
## Carrier Status City Flight_Count
## 1 Alaska on_time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 3 AMWEST on_time Los.Angeles 694
## 4 AMWEST delayed Los.Angeles 117
## 5 Alaska on_time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AMWEST on_time Phoenix 4840
## 8 AMWEST delayed Phoenix 415
## 9 Alaska on_time San.Diego 212
## 10 Alaska delayed San.Diego 20
## 11 AMWEST on_time San.Diego 383
## 12 AMWEST delayed San.Diego 65
## 13 Alaska on_time San.Francisco 503
## 14 Alaska delayed San.Francisco 102
## 15 AMWEST on_time San.Francisco 320
## 16 AMWEST delayed San.Francisco 129
## 17 Alaska on_time Seattle 1841
## 18 Alaska delayed Seattle 305
## 19 AMWEST on_time Seattle 201
## 20 AMWEST delayed Seattle 61
Let’s analyze the tidy dataset to compare the arrival delays.
Calculate the total number of flights by each carrier (airlines) to different destinations (Cities)
total.flights <- flight %>% group_by(Carrier) %>%
summarize(Total_Flights = sum(Flight_Count))
total.flights
## # A tibble: 2 x 2
## Carrier Total_Flights
## <fctr> <int>
## 1 Alaska 3775
## 2 AMWEST 7225
Let’s calculate the number of on-time flights for each carrier
flights.ontime <- flight %>% group_by(Carrier) %>%
filter(Status == 'on_time') %>%
summarize(FlightsOnTime = sum(Flight_Count))
flights.ontime
## # A tibble: 2 x 2
## Carrier FlightsOnTime
## <fctr> <int>
## 1 Alaska 3274
## 2 AMWEST 6438
Let’s find the number of delayed flights for each carrier
flights.delayed <- flight %>% group_by(Carrier) %>%
filter(Status == 'delayed') %>%
summarize(FlightsDelayed = sum(Flight_Count))
flights.delayed
## # A tibble: 2 x 2
## Carrier FlightsDelayed
## <fctr> <int>
## 1 Alaska 501
## 2 AMWEST 787
flights.info = cbind(flights.ontime,FlightsDelayed = flights.delayed$FlightsDelayed, TotalFlights=total.flights$Total_Flights)
flights.info
## Carrier FlightsOnTime FlightsDelayed TotalFlights
## 1 Alaska 3274 501 3775
## 2 AMWEST 6438 787 7225
flights.summary <- mutate(flights.info, PercentOnTime = FlightsOnTime/TotalFlights, PercentDelayed= FlightsDelayed/TotalFlights)
flights.summary
## Carrier FlightsOnTime FlightsDelayed TotalFlights PercentOnTime
## 1 Alaska 3274 501 3775 0.8672848
## 2 AMWEST 6438 787 7225 0.8910727
## PercentDelayed
## 1 0.1327152
## 2 0.1089273
Carrier.Alaska <- filter(flight, Carrier == 'Alaska')
Carrier.Alaska
## Carrier Status City Flight_Count
## 1 Alaska on_time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 3 Alaska on_time Phoenix 221
## 4 Alaska delayed Phoenix 12
## 5 Alaska on_time San.Diego 212
## 6 Alaska delayed San.Diego 20
## 7 Alaska on_time San.Francisco 503
## 8 Alaska delayed San.Francisco 102
## 9 Alaska on_time Seattle 1841
## 10 Alaska delayed Seattle 305
Carrier.AMWEST <- filter(flight, Carrier == 'AMWEST')
Carrier.AMWEST
## Carrier Status City Flight_Count
## 1 AMWEST on_time Los.Angeles 694
## 2 AMWEST delayed Los.Angeles 117
## 3 AMWEST on_time Phoenix 4840
## 4 AMWEST delayed Phoenix 415
## 5 AMWEST on_time San.Diego 383
## 6 AMWEST delayed San.Diego 65
## 7 AMWEST on_time San.Francisco 320
## 8 AMWEST delayed San.Francisco 129
## 9 AMWEST on_time Seattle 201
## 10 AMWEST delayed Seattle 61
For Alaskan airlines which city had most number of delays:
alaska.delays <- filter(Carrier.Alaska,Status=='delayed')
city.alaska.delays <- alaska.delays %>%
arrange(desc(alaska.delays$Flight_Count))
city.alaska.delays
## Carrier Status City Flight_Count
## 1 Alaska delayed Seattle 305
## 2 Alaska delayed San.Francisco 102
## 3 Alaska delayed Los.Angeles 62
## 4 Alaska delayed San.Diego 20
## 5 Alaska delayed Phoenix 12
alaska.ontime <- filter(Carrier.Alaska,Status=='on_time')
city.alaska.ontime.rank <- alaska.ontime %>%
arrange(desc(alaska.ontime$Flight_Count))
city.alaska.ontime.rank
## Carrier Status City Flight_Count
## 1 Alaska on_time Seattle 1841
## 2 Alaska on_time San.Francisco 503
## 3 Alaska on_time Los.Angeles 497
## 4 Alaska on_time Phoenix 221
## 5 Alaska on_time San.Diego 212
city.amwest.delays <- filter(Carrier.AMWEST, Status == 'delayed')
city.amwest.delays <- city.amwest.delays %>%
arrange(desc(city.amwest.delays$Flight_Count))
city.amwest.delays
## Carrier Status City Flight_Count
## 1 AMWEST delayed Phoenix 415
## 2 AMWEST delayed San.Francisco 129
## 3 AMWEST delayed Los.Angeles 117
## 4 AMWEST delayed San.Diego 65
## 5 AMWEST delayed Seattle 61
Let’s use Spread function on the delayed dataset
city.alaska.delays.spread <- spread(city.alaska.delays, City, Flight_Count)
kable(city.alaska.delays.spread)
Carrier | Status | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
---|---|---|---|---|---|---|
Alaska | delayed | 62 | 12 | 20 | 102 | 305 |
library(ggplot2)
ggplot(data = city.alaska.delays, aes(x=City,y=Flight_Count))+
geom_point(alpha = 0.5, size = 5, color ='blue')+
geom_text(aes(x = City, y = Flight_Count,
label = paste(Flight_Count),
group = Carrier,
vjust = -0.5))
city.flight.delays <- filter(flight, Status == 'delayed')
city.flight.delays <- city.flight.delays %>%
arrange(desc(city.flight.delays$Flight_Count))
city.flight.delays
## Carrier Status City Flight_Count
## 1 AMWEST delayed Phoenix 415
## 2 Alaska delayed Seattle 305
## 3 AMWEST delayed San.Francisco 129
## 4 AMWEST delayed Los.Angeles 117
## 5 Alaska delayed San.Francisco 102
## 6 AMWEST delayed San.Diego 65
## 7 Alaska delayed Los.Angeles 62
## 8 AMWEST delayed Seattle 61
## 9 Alaska delayed San.Diego 20
## 10 Alaska delayed Phoenix 12
kable(city.flight.delays)
Carrier | Status | City | Flight_Count |
---|---|---|---|
AMWEST | delayed | Phoenix | 415 |
Alaska | delayed | Seattle | 305 |
AMWEST | delayed | San.Francisco | 129 |
AMWEST | delayed | Los.Angeles | 117 |
Alaska | delayed | San.Francisco | 102 |
AMWEST | delayed | San.Diego | 65 |
Alaska | delayed | Los.Angeles | 62 |
AMWEST | delayed | Seattle | 61 |
Alaska | delayed | San.Diego | 20 |
Alaska | delayed | Phoenix | 12 |
library(ggplot2)
ggplot(data = city.flight.delays, aes(x=Carrier,y=Flight_Count))+
#geom_point(alpha = 0.5, size = 5, color ='blue')
geom_bar(stat = 'identity',aes(fill=Carrier))+
geom_text(aes(x = Carrier, y = Flight_Count,
label = paste(Flight_Count),
group = Carrier,
vjust = -0.4)) +
labs(title = "Comparing Delays by Carrier & City",
x = "Carrier",
y = "Delay Flight Count") +
facet_wrap(~City, ncol = 5)+
theme_bw()
Let’s use spread function to view the dataset for delayed flights
flight.delay.spread <- spread(city.flight.delays,City, Flight_Count)
kable(flight.delay.spread)
Carrier | Status | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
---|---|---|---|---|---|---|
Alaska | delayed | 62 | 12 | 20 | 102 | 305 |
AMWEST | delayed | 117 | 415 | 65 | 129 | 61 |
flight.df <- flight %>%
spread(Status,'Flight_Count')
kable(flight.df)
Carrier | City | delayed | on_time |
---|---|---|---|
Alaska | Los.Angeles | 62 | 497 |
Alaska | Phoenix | 12 | 221 |
Alaska | San.Diego | 20 | 212 |
Alaska | San.Francisco | 102 | 503 |
Alaska | Seattle | 305 | 1841 |
AMWEST | Los.Angeles | 117 | 694 |
AMWEST | Phoenix | 415 | 4840 |
AMWEST | San.Diego | 65 | 383 |
AMWEST | San.Francisco | 129 | 320 |
AMWEST | Seattle | 61 | 201 |
flight_percentile <- mutate(flight.df,delay_percent=round((delayed/(delayed+on_time)*100),2),
onTime_percent=round((on_time/(delayed+on_time))*100,2))
kable(flight_percentile)
Carrier | City | delayed | on_time | delay_percent | onTime_percent |
---|---|---|---|---|---|
Alaska | Los.Angeles | 62 | 497 | 11.09 | 88.91 |
Alaska | Phoenix | 12 | 221 | 5.15 | 94.85 |
Alaska | San.Diego | 20 | 212 | 8.62 | 91.38 |
Alaska | San.Francisco | 102 | 503 | 16.86 | 83.14 |
Alaska | Seattle | 305 | 1841 | 14.21 | 85.79 |
AMWEST | Los.Angeles | 117 | 694 | 14.43 | 85.57 |
AMWEST | Phoenix | 415 | 4840 | 7.90 | 92.10 |
AMWEST | San.Diego | 65 | 383 | 14.51 | 85.49 |
AMWEST | San.Francisco | 129 | 320 | 28.73 | 71.27 |
AMWEST | Seattle | 61 | 201 | 23.28 | 76.72 |
library(ggplot2)
ggplot(data = flight_percentile, aes(x=Carrier,y=delay_percent))+
geom_bar(stat = 'identity',aes(fill=Carrier))+
geom_text(aes(x = Carrier, y = delay_percent,
label = paste(delay_percent,'%'),
group = Carrier,
vjust = -0.4)) +
labs(title = "Comparing Delays(%) by Carrier & City",
x = "Carrier",
y = "Delay Percent") +
facet_wrap(~City, ncol = 5)+
theme_bw()
Conclusion:
(a) From the above plot, it can be concluded that AMWEST has most delay rate across 5 cities as compared to Alaska.
(b) San Fransicso has highest delay rate for both carriers.
(c) Phoenix has lowest delay rate for both carriers.