Libraries

library(tidyr)
library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v readr   1.3.1     v stringr 1.4.0
## v ggplot2 3.2.1     v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)

Initial Tidying

First, let’s import our data and take a quick look at it:

airlineData <- as_tibble(read.csv('airlineData.csv', header = TRUE, stringsAsFactors = FALSE))
airlineData
## # A tibble: 5 x 7
##   X       X.1     Los.Angeles Phoenix San.Diego San.Francisco Seattle
##   <chr>   <chr>         <int>   <int>     <int>         <int>   <int>
## 1 ALASKA  on time         497     221       212           503    1841
## 2 ""      delayed          62      12        20           102     305
## 3 ""      ""               NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5 ""      delayed         117     415        65           129      61

The first 2 columns aren’t labeled so let’s put some column names on them. While we’re at it, let’s eliminate the blank row separating the 2 airlines and bring the Airline name down into rows that have a status of delayed.

airlineData <- airlineData %>% 
  rename(AIRLINE = X, STATUS = X.1)%>% 
  filter(STATUS != '')

# replace blank with airline above
airlineData$AIRLINE[airlineData$AIRLINE == ''] <- airlineData$AIRLINE[str_detect(airlineData$AIRLINE,'[[:alpha:]]+')]

airlineData
## # A tibble: 4 x 7
##   AIRLINE STATUS  Los.Angeles Phoenix San.Diego San.Francisco Seattle
##   <chr>   <chr>         <int>   <int>     <int>         <int>   <int>
## 1 ALASKA  on time         497     221       212           503    1841
## 2 ALASKA  delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

The data looks better, but it’s still not completely tidy. According to Wickham, each variable in the dataset should have its own column. In the current structure of the dataset, we have 2 problems:

airlineData <- airlineData %>% 
  gather(CITY,NUM_FLIGHTS, -AIRLINE, -STATUS) %>%
  spread(STATUS, NUM_FLIGHTS)

colnames(airlineData) <- c('AIRLINE', 'CITY', 'DELAYED', 'ON_TIME')

airlineData
## # A tibble: 10 x 4
##    AIRLINE CITY          DELAYED ON_TIME
##    <chr>   <chr>           <int>   <int>
##  1 ALASKA  Los.Angeles        62     497
##  2 ALASKA  Phoenix            12     221
##  3 ALASKA  San.Diego          20     212
##  4 ALASKA  San.Francisco     102     503
##  5 ALASKA  Seattle           305    1841
##  6 AM WEST Los.Angeles       117     694
##  7 AM WEST Phoenix           415    4840
##  8 AM WEST San.Diego          65     383
##  9 AM WEST San.Francisco     129     320
## 10 AM WEST Seattle            61     201

Analysis

Now that we have tidy data, we can compare the number of delays for each of the airlines. We can see from the summary that AM WEST has more delays than ALASKA!

airlineData %>% 
  group_by(AIRLINE) %>%
  summarise(TOTAL_DELAYS = sum(DELAYED), 
            AVG_NUM_DELAYS = mean(DELAYED),
            MEDIAN_DELAYS = median(DELAYED),
            MIN_DELAYS = min(DELAYED), 
            MAX_DELAYS = max(DELAYED)) 
## # A tibble: 2 x 6
##   AIRLINE TOTAL_DELAYS AVG_NUM_DELAYS MEDIAN_DELAYS MIN_DELAYS MAX_DELAYS
##   <chr>          <int>          <dbl>         <int>      <int>      <int>
## 1 ALASKA           501           100.            62         12        305
## 2 AM WEST          787           157.           117         61        415