rm (list=ls())
library(tidyverse)
library(plyr)
library(knitr)
require(kableExtra)
library(kableExtra)
library(tidyr)
library(dplyr)
my_flight <- read.csv("flight_data.txt")
my_flight
##   Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle  X
## 1  Alaska on time         497     221       212           503    1841 NA
## 2    <NA> delayed          62      12        20           102     305 NA
## 3 AM West on time         694    4840       383           320     201 NA
## 4    <NA> delayed         117     415        65           129      61 NA

Must address the blank Airline rows

kable(my_flight)
Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle X
Alaska on time 497 221 212 503 1841 NA
NA delayed 62 12 20 102 305 NA
AM West on time 694 4840 383 320 201 NA
NA delayed 117 415 65 129 61 NA
fix_columns <- function (df,col){
    for (x in seq_along(col)){
        if (!is.na(col[x])){
            y <- col[x]
        }
        else {
            col[x] <- y
        }
    }
    df$col <- col
    return (df$col)
}

my_flight$Airline <- fix_columns(my_flight,my_flight$Airline)
kable(my_flight)
Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle X
Alaska on time 497 221 212 503 1841 NA
Alaska delayed 62 12 20 102 305 NA
AM West on time 694 4840 383 320 201 NA
AM West delayed 117 415 65 129 61 NA

Can I apply this function in the tidyverse instead?

my_flight_2 <- as_data_frame(read.csv("flight_data.txt")) %>% 
    mutate(Airline=fix_columns(.,.$Airline))
kable(my_flight_2)
Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle X
Alaska on time 497 221 212 503 1841 NA
Alaska delayed 62 12 20 102 305 NA
AM West on time 694 4840 383 320 201 NA
AM West delayed 117 415 65 129 61 NA

Let’s tidy the dataframe

tidy_flight <- my_flight_2 %>% 
    select(-X) %>% 
    gather(City,N,3:7) %>% 
    spread(Arrival,N) %>% 
    mutate(City=str_replace(City,"\\."," ")) %>%    
    plyr::rename(.,c('on time'= 'on_time'))
    
kable(tidy_flight)
Airline City delayed on_time
Alaska Los Angeles 62 497
Alaska Phoenix 12 221
Alaska San Diego 20 212
Alaska San Francisco 102 503
Alaska Seattle 305 1841
AM West Los Angeles 117 694
AM West Phoenix 415 4840
AM West San Diego 65 383
AM West San Francisco 129 320
AM West Seattle 61 201

Exploratory analysis of flight delays

delay_by_airline <- tidy_flight %>% 
    mutate(delay_freq=round(delayed/(on_time+delayed)*100,2)) %>% 
    select(Airline, City, delay_freq) %>% 
    spread(City,delay_freq) %>% 
    plyr::rename(.,c('Airline'= 'Airline- percent of late takeoffs by City'))  
kable(delay_by_airline)
Airline- percent of late takeoffs by City Los Angeles Phoenix San Diego San Francisco Seattle
Alaska 11.09 5.15 8.62 16.86 14.21
AM West 14.43 7.90 14.51 28.73 23.28
tidy_flight %>% 
  dplyr::group_by(Airline,City, delayed,on_time) %>% 
  dplyr::summarize(.) %>%
    mutate(total_flight=delayed+on_time) %>% 
    dplyr::group_by(Airline) %>% 
    plyr::rename(.,c('Airline'= 'Airline- average percent of late takeoffs by airline')) %>% 
    dplyr::summarize(mean_delayed_flights_percantage=mean(delayed/total_flight)*100) %>% 
    kable()
Airline mean_delayed_flights_percantage
Alaska 11.18683
AM West 17.76915
mean_airline_arriveal_stats <-  tidy_flight %>% 
    select(Airline,delayed,City, on_time) %>% 
    dplyr::group_by(Airline) %>% 
    dplyr::summarize(average_airport_delay_total_per_airport=mean(delayed))
kable(mean_airline_arriveal_stats)
Airline average_airport_delay_total_per_airport
Alaska 100.2
AM West 157.4
delay_by_city <- tidy_flight %>% 
    mutate(total_flights=delayed+on_time) %>% 
    mutate(delay_freq=round(delayed/(on_time+delayed)*100,2)) %>%  
    dplyr::group_by(Delay_by_city=City) %>% 
    dplyr::summarize(freq_of_delay_percantage=mean(delay_freq),total_takeoffs=sum(total_flights))
kable(delay_by_city)    
Delay_by_city freq_of_delay_percantage total_takeoffs
Los Angeles 12.760 1370
Phoenix 6.525 5488
San Diego 11.565 680
San Francisco 22.795 1054
Seattle 18.745 2408
#mutate(total_flights=delaye+on_time)

Conclusions