Flight Analysis

Loading package

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr) 
library(tidyr)
library(dplyr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Create a .csv file

 ## Used Rbind to generate the reproducible data
flights <- rbind(c(NA, NA, "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
             c("ALASKA", "on time", 497, 221, 212, 503, 1841),
             c(NA, "delayed", 62, 12, 20, 102, 305),
             c(NA, NA, NA, NA, NA, NA, NA),
             c("AM WEST", "on time", 694, 4840, 383, 320, 201),
             c(NA, "delayed", 117, 415, 65, 129, 61))

write.table(flights, file = "C:\\cuny education material\\Data607_projects_git\\flight.csv", sep = ",", col.names=F, row.names=F)

Read .CSV file

#display data in tabular format using kable

flights <-read.csv(paste("https://raw.githubusercontent.com/yathdeep/data607_labs/main/flight.csv"))


#fill the airline value in the NA place in 1 column,2nd and 4 row
#and also remove space between on time value

flights[2,1]<-flights[1,1]
flights[5,1]<-flights[4,1]

flights[,2] <- sapply(flights[,2], str_replace, " ", "_")

kable(flights) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% 
  scroll_box(width = "100%", height = "700px") 
NA. NA..1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
ALASKA on_time 497 221 212 503 1841
ALASKA delayed 62 12 20 102 305
NA NA NA NA NA NA NA
AM WEST on_time 694 4840 383 320 201
AM WEST delayed 117 415 65 129 61

Tidying Data

##rename NA. column with Airlines and NA.1 column with Status

#gather() will be used to transform dataset from its untidy form
#to a normalized tidy long form to fulfill the rule to convert variables into
#observation. Here, variables for destinations like Los.Angeles, San.Francisco
#etc. Seattle became observations.


tidy_airlines_data<-flights%>%na.omit()%>%rename("Airlines"="NA.","Status"="NA..1")%>%gather("Destination","Number of Flights",3:7)
 
tidy_airlines_data<-tidy_airlines_data%>%arrange(Airlines,desc(`Number of Flights`))

max_numberofflightsindicator<-which.max(tidy_airlines_data$`Number of Flights`)


kable(tidy_airlines_data) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
   row_spec(max_numberofflightsindicator, background = "pink") %>% 
  scroll_box(width = "100%", height = "700px")
Airlines Status Destination Number of Flights
ALASKA on_time Seattle 1841
ALASKA on_time San.Francisco 503
ALASKA on_time Los.Angeles 497
ALASKA delayed Seattle 305
ALASKA on_time Phoenix 221
ALASKA on_time San.Diego 212
ALASKA delayed San.Francisco 102
ALASKA delayed Los.Angeles 62
ALASKA delayed San.Diego 20
ALASKA delayed Phoenix 12
AM WEST on_time Phoenix 4840
AM WEST on_time Los.Angeles 694
AM WEST delayed Phoenix 415
AM WEST on_time San.Diego 383
AM WEST on_time San.Francisco 320
AM WEST on_time Seattle 201
AM WEST delayed San.Francisco 129
AM WEST delayed Los.Angeles 117
AM WEST delayed San.Diego 65
AM WEST delayed Seattle 61

Wide Data format

#spread is being used to transform data to wide form for better analysis.

wide_tidy_flights<- tidy_airlines_data %>%
  spread(key = Status, value = `Number of Flights`)

kable(wide_tidy_flights) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%", height = "700px")
Airlines Destination delayed on_time
ALASKA Los.Angeles 62 497
ALASKA Phoenix 12 221
ALASKA San.Diego 20 212
ALASKA San.Francisco 102 503
ALASKA Seattle 305 1841
AM WEST Los.Angeles 117 694
AM WEST Phoenix 415 4840
AM WEST San.Diego 65 383
AM WEST San.Francisco 129 320
AM WEST Seattle 61 201

Data Analyis Cont’d Using Dplyr

flights_with_otherstats <-wide_tidy_flights%>%
  mutate(total_flights=delayed + on_time,
         on_time_percent = round(on_time/total_flights*100, 2),
         delayed_percent = round(delayed/total_flights*100, 2))

kable(flights_with_otherstats) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%", height = "700px")
Airlines Destination delayed on_time total_flights on_time_percent delayed_percent
ALASKA Los.Angeles 62 497 559 88.91 11.09
ALASKA Phoenix 12 221 233 94.85 5.15
ALASKA San.Diego 20 212 232 91.38 8.62
ALASKA San.Francisco 102 503 605 83.14 16.86
ALASKA Seattle 305 1841 2146 85.79 14.21
AM WEST Los.Angeles 117 694 811 85.57 14.43
AM WEST Phoenix 415 4840 5255 92.10 7.90
AM WEST San.Diego 65 383 448 85.49 14.51
AM WEST San.Francisco 129 320 449 71.27 28.73
AM WEST Seattle 61 201 262 76.72 23.28
#arrange the flights to show most delays -in the order of airlines 

flight_delays<-flights_with_otherstats%>%arrange(Airlines,desc(delayed_percent))

kable(flight_delays) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%", height = "700px")
Airlines Destination delayed on_time total_flights on_time_percent delayed_percent
ALASKA San.Francisco 102 503 605 83.14 16.86
ALASKA Seattle 305 1841 2146 85.79 14.21
ALASKA Los.Angeles 62 497 559 88.91 11.09
ALASKA San.Diego 20 212 232 91.38 8.62
ALASKA Phoenix 12 221 233 94.85 5.15
AM WEST San.Francisco 129 320 449 71.27 28.73
AM WEST Seattle 61 201 262 76.72 23.28
AM WEST San.Diego 65 383 448 85.49 14.51
AM WEST Los.Angeles 117 694 811 85.57 14.43
AM WEST Phoenix 415 4840 5255 92.10 7.90
ontimeflights<-flights_with_otherstats%>%arrange(desc(on_time_percent))

kable(ontimeflights) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%", height = "700px")
Airlines Destination delayed on_time total_flights on_time_percent delayed_percent
ALASKA Phoenix 12 221 233 94.85 5.15
AM WEST Phoenix 415 4840 5255 92.10 7.90
ALASKA San.Diego 20 212 232 91.38 8.62
ALASKA Los.Angeles 62 497 559 88.91 11.09
ALASKA Seattle 305 1841 2146 85.79 14.21
AM WEST Los.Angeles 117 694 811 85.57 14.43
AM WEST San.Diego 65 383 448 85.49 14.51
ALASKA San.Francisco 102 503 605 83.14 16.86
AM WEST Seattle 61 201 262 76.72 23.28
AM WEST San.Francisco 129 320 449 71.27 28.73
#Alaskan airline has better ontime record compared to its competitor

Data Visulization

ggplot(flights_with_otherstats, aes(x=Destination, y=delayed_percent, fill=Airlines))+
  geom_bar(stat="identity", position=position_dodge()) +
  theme_classic()+
  geom_text(aes(label=delayed_percent),  hjust=1,position= position_dodge(width=0.9),  color="black") +
        ggtitle("Flights Delayed by Airline by City %") +
    xlab("Destination") + ylab("Flights Delayed %") +
        coord_flip() 

## More comparison

airlines_compared<-flights_with_otherstats%>%group_by(Airlines)%>%
            summarise(flight_on_time = sum(on_time), flights_delayed = sum(delayed), 
            percent_on_time = round(flight_on_time/sum(flight_on_time,flights_delayed )*100,2),
            percent_delay = round(flights_delayed/sum(flight_on_time,flights_delayed)*100,2))%>%
            select(Airlines,percent_on_time,percent_delay)
## `summarise()` ungrouping output (override with `.groups` argument)
airlines_compared %>% arrange(desc(percent_on_time))
## # A tibble: 2 x 3
##   Airlines percent_on_time percent_delay
##   <chr>              <dbl>         <dbl>
## 1 AM WEST             89.1          10.9
## 2 ALASKA              86.7          13.3
#view the above comparison on the chart
ggplot(airlines_compared, aes(x=Airlines, y=percent_delay , fill=Airlines))+
  geom_bar(stat="identity", position=position_dodge(),width = 0.3) +
  theme_classic()+
  ggtitle("Overall % Of Delayed Flights by Airlines") +
  xlab("AirLines") + ylab("Delayed %") 

#From above we can see that that AM WEST Airlines performed better than ALASKAN
#but on the other hand we noticed earlier than top 4 best performances were from
#ALASKAN airlines so there is some anomaly in this finding which needs to be 
#investigated further.