Airlines across five destinations

CUNY MSDS - DATA607 - Home Work-5

James Kuruvilla

September 30, 2017

Assignment

————————————————————————————————————
————————————————————————————————————

Library Definition

library(tidyverse)

Read CSV File from GitHub

I created a .csv file named assignment5_csv.csv and saved that in GitHub.

GitHub URL

url <- 'https://raw.githubusercontent.com/jameskuruvilla/DATA607/master/Assignment%205_CSV.csv'

Read CSV file from URL using read_csv()

name <- c("Airlines","Status","Los_Angeles","Phoenix","San_Diego", "San_Francisco","Seattle")

types = 'ccccccc'
airlines <- read_csv(url,col_names = name, col_types =types)

airlines

Search NULL rows and remove.

null_airlines <- which(is.na(airlines$Status)) 

airlines<-airlines[-null_airlines,]
airlines
#airlines<-airlines%>%fill(Airlines)
#airlines

Replace NULL values in the first column with the previous value of the same column

for (i in 1:nrow(airlines[1]))
{
  if (is.na(airlines[i,1])) 
    {airlines[i,1]<- airlines[i-1,1]}
}

airlines

Use gather() from the library ‘tidyverse’ to make the tibble long and tidy

airlines_long <- gather(airlines, City,No.Of.Flights, Los_Angeles:Seattle)
  
airlines_long$No.Of.Flights <- as.numeric(airlines_long$No.Of.Flights)
airlines_long

Use spread() from the library ‘tidyverse’ to make wide

airlines_wide <- spread(airlines_long,Status,No.Of.Flights)

#Column name 'on time' is changed to 'On_Time'
colnames(airlines_wide)[4]<- 'On_time'

airlines_wide

Find total number of flights for each city

airlines_wide_totals <- airlines_wide

airlines_wide_totals$Totals <- airlines_wide_totals$delayed + airlines_wide_totals$On_time 

airlines_wide_totals

Find the Probability of arrival delays of the two airlines by city

airlines_final <- airlines_wide_totals

airlines_final$Ratio <-((airlines_final$delayed/airlines_final$Totals)*100) %>% round(digits=2)
airlines_final<-airlines_final%>% arrange(City,Ratio)

airlines_final$Ratio <- paste(as.character(airlines_final$Ratio),'%')

airlines_final

Conclusion 1

For Example, in Los Angeles, the probability of a delayed flight by ALASKA airlines is slightly LOWER than that of AM WEST Airlines (11.09% < 14.43%) and similarly for all the cities. Please notice the probability of the delayed flight when we consider the overall delay ratio of the two airlines below

Overall Probability of arrival delays of the two airlines

airlines_delay_ratio <- airlines_wide_totals

airlines_delay_ratio<- airlines_delay_ratio%>%
                       select(1,3,4,5)     %>% 
                       group_by(Airlines)  %>%
                       summarise(Total_delay = sum(delayed), Total_On_Time = sum(On_time))

airlines_delay_ratio$Total <- airlines_delay_ratio$Total_delay + airlines_delay_ratio$Total_On_Time

airlines_delay_ratio$Over_All_Delay_Ratio <- 
                      ((airlines_delay_ratio$Total_delay/airlines_delay_ratio$Total)*100)%>% 
                      round(digit=2)

airlines_delay_ratio$Over_All_Delay_Ratio <- 
                           paste(as.character(airlines_delay_ratio$Over_All_Delay_Ratio), '%')

airlines_delay_ratio

Conclusion 2

When we consider probability of a delayed flight across all the cities, ALASKA Airline carries a HIGHER probability of having a delayed flight. This is because of the difference in sample size for both airlines.

Probability of a randomly selected flight to be delayed

delay_ratio <- airlines_delay_ratio

delay_ratio <- airlines_delay_ratio%>% select(2,3,4)%>%
               summarise(Total_delay = sum(Total_delay), Total_On_Time = sum(Total_On_Time))
delay_ratio$Total <- delay_ratio$Total_delay+delay_ratio$Total_On_Time

delay_ratio$Pro_Delay <-((delay_ratio$Total_delay/delay_ratio$Total_On_Time)*100)%>%round(digit=2)

delay_ratio

Conclusion 3

The probability of a randomly selected flight across the cities to be delayed is 13.26 % (0.1326)