# Report how many distinct carriers appear in the flights data.
# sep = "" prevents cat()'s default " " separator from doubling the space that
# already ends the label; the explicit "\n" terminates the output line.
cat(
  "Number of Distinct Carriers: ",
  n_distinct(flights$carrier),
  "\n",
  sep = ""
)
Number of Distinct Carriers: 16
Summarize
# Print summary statistics for every column of the flights table.
flights |>
  summary()
year month day dep_time sched_dep_time
Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
NA's :8255
dep_delay arr_time sched_arr_time arr_delay
Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
Median : -2.00 Median :1535 Median :1556 Median : -5.000
Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
NA's :8255 NA's :8713 NA's :9430
carrier flight tailnum origin
Length:336776 Min. : 1 Length:336776 Length:336776
Class :character 1st Qu.: 553 Class :character Class :character
Mode :character Median :1496 Mode :character Mode :character
Mean :1972
3rd Qu.:3465
Max. :8500
dest air_time distance hour
Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
Mode :character Median :129.0 Median : 872 Median :13.00
Mean :150.7 Mean :1040 Mean :13.18
3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
Max. :695.0 Max. :4983 Max. :23.00
NA's :9430
minute time_hour
Min. : 0.00 Min. :2013-01-01 05:00:00.00
1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00.00
Median :29.00 Median :2013-07-03 10:00:00.00
Mean :26.23 Mean :2013-07-03 05:22:54.64
3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00.00
Max. :59.00 Max. :2013-12-31 23:00:00.00
Graph On-Time Performance using Departure Delay and Arrival Delay
I work with a team that develops software for the transit industry, namely the taxi, sedan, and paratransit markets. Some of the most important data collected for reporting is used to analyze key performance indicators (KPIs), and the one agencies look at most is “On-Time Performance,” which is usually defined as arriving at the origin location within 15 minutes of the requested/scheduled pickup time. After googling to determine whether the airline industry uses the same metric, I found that it does. So I worked on making a bidirectional bar graph that shows both the departure delay percentage and the arrival delay percentage for each carrier (16 in total). I tried exporting the dataset to make sure the formula was correct, but it was taking too long, so I’m not sure whether it’s correct; I’m more familiar with SQL, but I guess it’s a learning process.
# Calculate the percentage of flights with less than 15 minutes delay
# (OTP-Punctuality) for each carrier.
# The comparison is strictly `< 15` so the numbers agree with the chart caption
# ("< 15 Minutes Delay").
# Rows whose delay is NA are removed from the numerator by na.rm = TRUE but are
# still counted by n(), so they lower a carrier's percentage.
delay_punctuality <- flights |>
  group_by(carrier) |>
  summarize(
    Departure_Percentage = sum(dep_delay < 15, na.rm = TRUE) / n() * 100,
    Arrival_Percentage = sum(arr_delay < 15, na.rm = TRUE) / n() * 100
  )

# Join the delay_punctuality dataset with the airlines dataset
delay_punctuality_labels <- left_join(
  delay_punctuality,
  airlines,
  by = "carrier"
)

# Drop the "Inc." / "Co." suffixes, then trim the whitespace the removal leaves
# behind so the right-aligned axis labels line up.
delay_punctuality_labels$name <- trimws(
  gsub("Inc\\.|Co\\.", "", delay_punctuality_labels$name)
)

# Create a bidirectional horizontal bar chart: departure percentages are
# plotted as negative x values (left side), arrival percentages as positive x
# values (right side). Carriers are ordered by departure percentage.
ggplot(
  delay_punctuality_labels,
  aes(x = -Departure_Percentage, y = reorder(name, Departure_Percentage))
) +
  # departure % labels
  geom_text(
    aes(label = paste0(round(Departure_Percentage, 0), "%")),
    hjust = 1.1,
    size = 4
  ) +
  geom_col(aes(fill = "Departure_Percentage"), width = .75) +
  geom_col(
    aes(x = Arrival_Percentage, fill = "Arrival_Percentage"),
    width = .75
  ) +
  # arrival % labels
  geom_text(
    aes(
      x = Arrival_Percentage,
      label = paste0(round(Arrival_Percentage, 0), "%")
    ),
    hjust = -.1,
    size = 4
  ) +
  labs(
    x = "Departures < On-Time Performance > Arrivals",
    y = "Carrier",
    title = "On-Time Performance of Airline Carriers",
    caption = "(% of Flights < 15 Minutes Delay)"
  ) +
  scale_fill_manual(
    name = "Performance",
    # Specify the order of legend items
    breaks = c("Departure_Percentage", "Arrival_Percentage"),
    values = c(
      "Departure_Percentage" = "#8bd3c7",
      "Arrival_Percentage" = "#beb9db"
    ),
    labels = c(
      "Departure_Percentage" = "Departure",
      "Arrival_Percentage" = "Arrival"
    )
  ) +
  # Positive negative axis: show absolute values on both sides of zero; the
  # limits extend past 100 to leave room for the percentage labels.
  scale_x_continuous(labels = abs, limits = c(-120, 120)) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.text = element_text(size = 12),
    axis.title.y = element_blank(),
    plot.title = element_text(hjust = .36, size = 14),
    axis.title.x = element_text(hjust = 0, size = 12, vjust = -0.75),
    plot.caption = element_text(hjust = .5, size = 10, vjust = -0.75),
    plot.margin = margin(20, 10, 20, 10)
  )