library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(ggplot2)

Gather Data

Treat empty columns as NA, while reading file as csv.

Airlines <- read.csv("https://raw.githubusercontent.com/uplotnik/Data607/master/Airlines.csv", na.strings = c("", "NA"),
                    sep = ",", header = TRUE)

Airlines 
##         X     X.1 Los.Angeles Phonexi San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221        21           503    1841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129      61

Data Manipulation

Change column names

names(Airlines)[1] <- "Carrier"
names(Airlines)[2] <- "Flight_status"
Airlines
##   Carrier Flight_status Los.Angeles Phonexi San.Diego San.Francisco
## 1  ALASKA       on time         497     221        21           503
## 2    <NA>       delayed          62      12        20           102
## 3    <NA>          <NA>          NA      NA        NA            NA
## 4 AM WEST       on time         694    4840       383           320
## 5    <NA>       delayed         117     415        65           129
##   Seattle
## 1    1841
## 2     305
## 3      NA
## 4     201
## 5      61

Fill the NA’s with the corresponding name

Airlines <- Airlines %>% fill(Carrier)
Airlines
##   Carrier Flight_status Los.Angeles Phonexi San.Diego San.Francisco
## 1  ALASKA       on time         497     221        21           503
## 2  ALASKA       delayed          62      12        20           102
## 3  ALASKA          <NA>          NA      NA        NA            NA
## 4 AM WEST       on time         694    4840       383           320
## 5 AM WEST       delayed         117     415        65           129
##   Seattle
## 1    1841
## 2     305
## 3      NA
## 4     201
## 5      61

Use ‘gather’ function to re-structure the table.

We can gather each city within one column variable and also gather the values associated with each city in a second column variable.

Airlines <- Airlines %>% gather(City,  Number, Los.Angeles:Seattle)
head(Airlines)
##   Carrier Flight_status        City Number
## 1  ALASKA       on time Los.Angeles    497
## 2  ALASKA       delayed Los.Angeles     62
## 3  ALASKA          <NA> Los.Angeles     NA
## 4 AM WEST       on time Los.Angeles    694
## 5 AM WEST       delayed Los.Angeles    117
## 6  ALASKA       on time     Phonexi    221

Spread “delayed”" and “on time” flights

Airline <- spread(Airlines,"Flight_status", "Number")
head(Airline)
##   Carrier          City delayed on time <NA>
## 1  ALASKA   Los.Angeles      62     497   NA
## 2  ALASKA       Phonexi      12     221   NA
## 3  ALASKA     San.Diego      20      21   NA
## 4  ALASKA San.Francisco     102     503   NA
## 5  ALASKA       Seattle     305    1841   NA
## 6 AM WEST   Los.Angeles     117     694   NA

Change column name “on time” with one word “Timely”

(in order to avoid errors with future manipulations)

Airline<- dplyr::rename(Airline, 'Timely'='on time')
Airline
##    Carrier          City delayed Timely <NA>
## 1   ALASKA   Los.Angeles      62    497   NA
## 2   ALASKA       Phonexi      12    221   NA
## 3   ALASKA     San.Diego      20     21   NA
## 4   ALASKA San.Francisco     102    503   NA
## 5   ALASKA       Seattle     305   1841   NA
## 6  AM WEST   Los.Angeles     117    694   NA
## 7  AM WEST       Phonexi     415   4840   NA
## 8  AM WEST     San.Diego      65    383   NA
## 9  AM WEST San.Francisco     129    320   NA
## 10 AM WEST       Seattle      61    201   NA

Data Analysis

Find total average delayed flights by city

## Ave delay for each city 
delay_avg <- Airline %>%
  group_by(City) %>%
summarise(Ave_City_Delay= round(mean(delayed, na.rm = TRUE), digits=5)) %>% 
 ##summarise(Ave_City_Timely= round(mean(Timely, na.rm = TRUE), digits=2))%>%
 arrange(.$Ave_City_Delay) 
delay_avg 
## # A tibble: 5 x 2
##   City          Ave_City_Delay
##   <chr>                  <dbl>
## 1 San.Diego               42.5
## 2 Los.Angeles             89.5
## 3 San.Francisco          116. 
## 4 Seattle                183  
## 5 Phonexi                214.

Find total average flights arrived on time by city

## Ave arrival on time for each city 

Timely_avg <- Airline %>%
  group_by(City) %>%
summarise(Ave_City_Timely= round(mean(Timely, na.rm = TRUE), digits=5)) %>%
  arrange(.$Ave_City_Timely)
Timely_avg 
## # A tibble: 5 x 2
##   City          Ave_City_Timely
##   <chr>                   <dbl>
## 1 San.Diego                202 
## 2 San.Francisco            412.
## 3 Los.Angeles              596.
## 4 Seattle                 1021 
## 5 Phonexi                 2530.

Join both tables by City

City_tab <- left_join(delay_avg, Timely_avg, by = c('City'))%>%
  arrange(.$City)
City_tab
## # A tibble: 5 x 3
##   City          Ave_City_Delay Ave_City_Timely
##   <chr>                  <dbl>           <dbl>
## 1 Los.Angeles             89.5            596.
## 2 Phonexi                214.            2530.
## 3 San.Diego               42.5            202 
## 4 San.Francisco          116.             412.
## 5 Seattle                183             1021

Find average delayed flights for each Airline

## Find average of delays for each Airline
delay_avg_carrier <- Airline %>%
  group_by(Carrier) %>%
summarise(Ave_delayed= round(mean(delayed, na.rm = TRUE), digits=2)) %>%
  arrange(.$Ave_delayed)
delay_avg_carrier 
## # A tibble: 2 x 2
##   Carrier Ave_delayed
##   <fct>         <dbl>
## 1 ALASKA         100.
## 2 AM WEST        157.

Find average flights arrived on time for each Airline

timely_avg_carrier <- Airline %>%
  group_by(Carrier) %>%
summarise(Ave_timely= round(mean(Timely, na.rm = TRUE), digits=5)) %>%
  arrange(.$Ave_timely)
head(timely_avg_carrier) 
## # A tibble: 2 x 2
##   Carrier Ave_timely
##   <fct>        <dbl>
## 1 ALASKA        617.
## 2 AM WEST      1288.

Join both tables by “Carrier”

Carrier1 <- left_join(delay_avg_carrier, timely_avg_carrier, by = c('Carrier'))%>%
  arrange(.$Carrier)
Carrier1
## # A tibble: 2 x 3
##   Carrier Ave_delayed Ave_timely
##   <fct>         <dbl>      <dbl>
## 1 ALASKA         100.       617.
## 2 AM WEST        157.      1288.

Gather Ave_status

Airlines1 <- Carrier1 %>% gather(Ave_status,  Flights, Ave_delayed:Ave_timely)
head(Airlines1)
## # A tibble: 4 x 3
##   Carrier Ave_status  Flights
##   <fct>   <chr>         <dbl>
## 1 ALASKA  Ave_delayed    100.
## 2 AM WEST Ave_delayed    157.
## 3 ALASKA  Ave_timely     617.
## 4 AM WEST Ave_timely    1288.

Vizualization

ggplot(
  Airlines1, aes(x = Carrier, y = Flights,fill=Ave_status)) + 
  geom_bar(stat="identity") +
  ggtitle("Airlines: AVE DELAY vs AVE ON Time")+ 
  theme(axis.text=element_text(angle=90))+
  labs(x="City",y="Flights")

ggplot(
  City_tab, aes(x = City, y = Ave_City_Timely ,fill=Ave_City_Timely)) + 
  geom_bar(stat="identity") +
  ggtitle("City: Ave ON TIME")+ 
  theme(axis.text=element_text(angle=90))+
  labs(x="City",y="Ave_City_Timely")