Tidying and Transforming Data

Import the flights csv file which describes arrivals and delays for two airlines across five destinations. The CSV file was created and exported in MySQL Workbench 6.3 CE.

Load libraries

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyr)
library(dplyr)
library(knitr)

Read the flights csv file

flights <- read_csv("https://raw.githubusercontent.com/LilesB/DATA-607-5WK-Assignment/master/flights.csv")
## Parsed with column specification:
## cols(
##   AID = col_integer(),
##   Airline = col_character(),
##   FlightStatus = col_character(),
##   LosAngeles = col_integer(),
##   Phoenix = col_integer(),
##   SanDiego = col_integer(),
##   SanFrancisco = col_integer(),
##   Seattle = col_integer()
## )
glimpse(flights)
## Observations: 4
## Variables: 8
## $ AID          <int> 1, 2, 3, 4
## $ Airline      <chr> "ALASKA", "ALASKA", "AM WEST", "AM WEST"
## $ FlightStatus <chr> "on time", "delayed", "on time", "delayed"
## $ LosAngeles   <int> 497, 62, 694, 117
## $ Phoenix      <int> 221, 12, 4840, 415
## $ SanDiego     <int> 212, 20, 383, 65
## $ SanFrancisco <int> 503, 102, 320, 129
## $ Seattle      <int> 1841, 305, 201, 61

The flights tibble has 4 observations and 8 variables. In order to create a clean environment, we will remove the AID primary key variable and then use the kable function to display the contents of the tibble.

flights <- subset(flights,select=-AID)
kable(flights)
Airline FlightStatus LosAngeles Phoenix SanDiego SanFrancisco Seattle
ALASKA on time 497 221 212 503 1841
ALASKA delayed 62 12 20 102 305
AM WEST on time 694 4840 383 320 201
AM WEST delayed 117 415 65 129 61

Use the gather() function to condense the original 4 observations & 7 variable tibble to a tibble with 20 observations & 4 variables and create a tibble entitled flights_gather

flights_gather <- flights %>% gather(Destination,NumFlights,-Airline,-FlightStatus)
kable(flights_gather)
Airline FlightStatus Destination NumFlights
ALASKA on time LosAngeles 497
ALASKA delayed LosAngeles 62
AM WEST on time LosAngeles 694
AM WEST delayed LosAngeles 117
ALASKA on time Phoenix 221
ALASKA delayed Phoenix 12
AM WEST on time Phoenix 4840
AM WEST delayed Phoenix 415
ALASKA on time SanDiego 212
ALASKA delayed SanDiego 20
AM WEST on time SanDiego 383
AM WEST delayed SanDiego 65
ALASKA on time SanFrancisco 503
ALASKA delayed SanFrancisco 102
AM WEST on time SanFrancisco 320
AM WEST delayed SanFrancisco 129
ALASKA on time Seattle 1841
ALASKA delayed Seattle 305
AM WEST on time Seattle 201
AM WEST delayed Seattle 61

Next, we use the spread() function to condense flights_long to a tibble with 10 observations & 4 variables named flights_spread

flights_spread <- flights_gather %>% spread(FlightStatus,NumFlights)
kable(flights_spread)
Airline Destination delayed on time
ALASKA LosAngeles 62 497
ALASKA Phoenix 12 221
ALASKA SanDiego 20 212
ALASKA SanFrancisco 102 503
ALASKA Seattle 305 1841
AM WEST LosAngeles 117 694
AM WEST Phoenix 415 4840
AM WEST SanDiego 65 383
AM WEST SanFrancisco 129 320
AM WEST Seattle 61 201

Analyze the data

In order to analyze the data first the glimpse function was used in order to see if the delayed and on time variables were integers.

After utilizing the spread() function we see that on time needs to be renamed which was done immediately after the glimpse function.

glimpse(flights_spread)
## Observations: 10
## Variables: 4
## $ Airline     <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", ...
## $ Destination <chr> "LosAngeles", "Phoenix", "SanDiego", "SanFrancisco...
## $ delayed     <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ `on time`   <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
colnames (flights_spread) <- c("Airline", "Destination", "Delayed","OnTime")
names(flights_spread)
## [1] "Airline"     "Destination" "Delayed"     "OnTime"

Next we create a variable entitled TotFlights which adds the Delayed and OnTime variables.

flights_spread <- flights_spread %>% mutate(TotFlights = Delayed + OnTime)
flights_spread %>% select(Airline,Destination,TotFlights)
## # A tibble: 10 x 3
##    Airline Destination  TotFlights
##    <chr>   <chr>             <int>
##  1 ALASKA  LosAngeles          559
##  2 ALASKA  Phoenix             233
##  3 ALASKA  SanDiego            232
##  4 ALASKA  SanFrancisco        605
##  5 ALASKA  Seattle            2146
##  6 AM WEST LosAngeles          811
##  7 AM WEST Phoenix            5255
##  8 AM WEST SanDiego            448
##  9 AM WEST SanFrancisco        449
## 10 AM WEST Seattle             262

Next we create a variable entitled DelayedRatio which divides the Delayed variable by TotFlights

flights_spread <- flights_spread %>% mutate(DelayedRatio = Delayed/TotFlights) 
flights_spread %>% select(Airline,Destination,DelayedRatio)
## # A tibble: 10 x 3
##    Airline Destination  DelayedRatio
##    <chr>   <chr>               <dbl>
##  1 ALASKA  LosAngeles         0.111 
##  2 ALASKA  Phoenix            0.0515
##  3 ALASKA  SanDiego           0.0862
##  4 ALASKA  SanFrancisco       0.169 
##  5 ALASKA  Seattle            0.142 
##  6 AM WEST LosAngeles         0.144 
##  7 AM WEST Phoenix            0.0790
##  8 AM WEST SanDiego           0.145 
##  9 AM WEST SanFrancisco       0.287 
## 10 AM WEST Seattle            0.233

Next we create a variable entitled OnTimeRatio which divides the OnTime variable by TotFlights

flights_spread <- flights_spread %>% mutate(OnTimeRatio = OnTime/TotFlights) 
flights_spread %>% select(Airline,Destination,OnTimeRatio)
## # A tibble: 10 x 3
##    Airline Destination  OnTimeRatio
##    <chr>   <chr>              <dbl>
##  1 ALASKA  LosAngeles         0.889
##  2 ALASKA  Phoenix            0.948
##  3 ALASKA  SanDiego           0.914
##  4 ALASKA  SanFrancisco       0.831
##  5 ALASKA  Seattle            0.858
##  6 AM WEST LosAngeles         0.856
##  7 AM WEST Phoenix            0.921
##  8 AM WEST SanDiego           0.855
##  9 AM WEST SanFrancisco       0.713
## 10 AM WEST Seattle            0.767

Analysis

Top Five Airline/Destination combo - Delayed Flights

LowDel5 <- arrange(flights_spread, Delayed)
LowDel5 %>% select(Airline:Delayed) %>% slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination Delayed
##   <chr>   <chr>         <int>
## 1 ALASKA  Phoenix          12
## 2 ALASKA  SanDiego         20
## 3 AM WEST Seattle          61
## 4 ALASKA  LosAngeles       62
## 5 AM WEST SanDiego         65

Bottom Five Airline/Destination combo - Delayed Flights

TopDel5 <- arrange(flights_spread,desc (Delayed))
TopDel5 %>% select(Airline:Delayed) %>% slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination  Delayed
##   <chr>   <chr>          <int>
## 1 AM WEST Phoenix          415
## 2 ALASKA  Seattle          305
## 3 AM WEST SanFrancisco     129
## 4 AM WEST LosAngeles       117
## 5 ALASKA  SanFrancisco     102

Bottom Five Airline/Destination combo - On Time Flights

LowOnTime5 <- arrange(flights_spread, OnTime)
LowOnTime5 %>% select(Airline,Destination,OnTime) %>%
slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination  OnTime
##   <chr>   <chr>         <int>
## 1 AM WEST Seattle         201
## 2 ALASKA  SanDiego        212
## 3 ALASKA  Phoenix         221
## 4 AM WEST SanFrancisco    320
## 5 AM WEST SanDiego        383

Top Five Airline/Destination combo - On Time Flights

TopOnTime5 <- arrange(flights_spread, desc(OnTime))
TopOnTime5 %>% select(Airline,Destination,OnTime) %>%
slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination  OnTime
##   <chr>   <chr>         <int>
## 1 AM WEST Phoenix        4840
## 2 ALASKA  Seattle        1841
## 3 AM WEST LosAngeles      694
## 4 ALASKA  SanFrancisco    503
## 5 ALASKA  LosAngeles      497

Bottom Five Airline/Destination combo - Total Number of Flight

LowTotFlights5 <- arrange(flights_spread, TotFlights)
LowTotFlights5 %>% select(Airline,Destination,TotFlights) %>%
slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination  TotFlights
##   <chr>   <chr>             <int>
## 1 ALASKA  SanDiego            232
## 2 ALASKA  Phoenix             233
## 3 AM WEST Seattle             262
## 4 AM WEST SanDiego            448
## 5 AM WEST SanFrancisco        449

Top Five Airline/Destination combo - Total Number of Flight

TopTotFlights5 <- arrange(flights_spread, desc(TotFlights))
TopTotFlights5 %>% select(Airline,Destination,TotFlights) %>%
slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination  TotFlights
##   <chr>   <chr>             <int>
## 1 AM WEST Phoenix            5255
## 2 ALASKA  Seattle            2146
## 3 AM WEST LosAngeles          811
## 4 ALASKA  SanFrancisco        605
## 5 ALASKA  LosAngeles          559

Top Five Airline/Destination combo - Delayed Ratio

LowDelRats5 <- arrange(flights_spread, DelayedRatio)
LowDelRats5 %>% select(Airline,Destination,DelayedRatio) %>%
slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination DelayedRatio
##   <chr>   <chr>              <dbl>
## 1 ALASKA  Phoenix           0.0515
## 2 AM WEST Phoenix           0.0790
## 3 ALASKA  SanDiego          0.0862
## 4 ALASKA  LosAngeles        0.111 
## 5 ALASKA  Seattle           0.142

Bottom Five Airline/Destination combo - Delayed Ratio

TopDelRats5 <- arrange(flights_spread, desc(DelayedRatio))
TopDelRats5 %>% select(Airline,Destination,DelayedRatio) %>%
slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination  DelayedRatio
##   <chr>   <chr>               <dbl>
## 1 AM WEST SanFrancisco        0.287
## 2 AM WEST Seattle             0.233
## 3 ALASKA  SanFrancisco        0.169
## 4 AM WEST SanDiego            0.145
## 5 AM WEST LosAngeles          0.144

Bottom Five Airline/Destination combo - On Time Ratio

LowOnTimeRat5 <- arrange(flights_spread, OnTimeRatio)
LowOnTimeRat5 %>% select(Airline,Destination,OnTimeRatio) %>%
slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination  OnTimeRatio
##   <chr>   <chr>              <dbl>
## 1 AM WEST SanFrancisco       0.713
## 2 AM WEST Seattle            0.767
## 3 ALASKA  SanFrancisco       0.831
## 4 AM WEST SanDiego           0.855
## 5 AM WEST LosAngeles         0.856

Top Five Airline/Destination combo - On Time Ratio

TopOnTimeRat5 <- arrange(flights_spread, desc(OnTimeRatio))
TopOnTimeRat5 %>% select(Airline,Destination,OnTimeRatio) %>%
slice(1:5)
## # A tibble: 5 x 3
##   Airline Destination OnTimeRatio
##   <chr>   <chr>             <dbl>
## 1 ALASKA  Phoenix           0.948
## 2 AM WEST Phoenix           0.921
## 3 ALASKA  SanDiego          0.914
## 4 ALASKA  LosAngeles        0.889
## 5 ALASKA  Seattle           0.858

Delayed Data Element

delayed <- flights_spread %>% group_by(Airline) %>% summarise(DelayedMean=mean(Delayed), DelayedSD=sd(Delayed), DelayedMed=median(Delayed))
delayed
## # A tibble: 2 x 4
##   Airline DelayedMean DelayedSD DelayedMed
##   <chr>         <dbl>     <dbl>      <int>
## 1 ALASKA          100       120         62
## 2 AM WEST         157       147        117
boxplot(flights_spread$Delayed ~ flights_spread$Airline, main = "Delayed Flights Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of Delayed Flights", col = c("yellow","beige"))

barplot(delayed$DelayedMean, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: Delayed Flights Mean", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")

On Time Data Element

on_time <- flights_spread %>% group_by(Airline) %>% summarise(OnTimeMean=mean(OnTime), OnTimeSD=sd(OnTime), OnTimeMed=median(OnTime))
on_time
## # A tibble: 2 x 4
##   Airline OnTimeMean OnTimeSD OnTimeMed
##   <chr>        <dbl>    <dbl>     <int>
## 1 ALASKA         655      678       497
## 2 AM WEST       1288     1994       383
boxplot(flights_spread$OnTime ~ flights_spread$Airline, main = "On Time Flights Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of On Time Flights", col = c("yellow","beige"))

barplot(on_time$OnTimeMed, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: On Time Flights Median", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")

Total Flights Data Element

total_flights <- flights_spread %>% group_by(Airline) %>% summarise(TotFlightsMean=mean(TotFlights), TotFlightsSD=sd(TotFlights), TotFlightsMed=median(TotFlights))
total_flights
## # A tibble: 2 x 4
##   Airline TotFlightsMean TotFlightsSD TotFlightsMed
##   <chr>            <dbl>        <dbl>         <int>
## 1 ALASKA             755          797           559
## 2 AM WEST           1445         2139           449
boxplot(flights_spread$TotFlights ~ flights_spread$Airline, main = "Total Flights Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of Flights", col = c("yellow","beige"))

barplot(total_flights$TotFlightsMed, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: Total Number of  Flights Median", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")

Delayed Ratio Data Element

delayed_ratio <- flights_spread %>% group_by(Airline) %>% summarise(DelRatMean=mean(DelayedRatio), DelRatSD=sd(DelayedRatio), DelRatMed=median(DelayedRatio))
delayed_ratio
## # A tibble: 2 x 4
##   Airline DelRatMean DelRatSD DelRatMed
##   <chr>        <dbl>    <dbl>     <dbl>
## 1 ALASKA       0.112   0.0459     0.111
## 2 AM WEST      0.178   0.0821     0.145
boxplot(flights_spread$DelayedRatio ~ flights_spread$Airline, main = "Delayed Ratio Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of Flights", col = c("yellow","beige"))

round(delayed_ratio$DelRatMean,digits = 2)
## [1] 0.11 0.18
barplot(delayed_ratio$DelRatMean, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: Delayed Ratio", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")

On Time Ratio Data Element

ontime_ratio <- flights_spread %>% group_by(Airline) %>% summarise(OnTRatMean=mean(OnTimeRatio), OnTRatSD=sd(OnTimeRatio), OnTRatMed=median(OnTimeRatio))
ontime_ratio
## # A tibble: 2 x 4
##   Airline OnTRatMean OnTRatSD OnTRatMed
##   <chr>        <dbl>    <dbl>     <dbl>
## 1 ALASKA       0.888   0.0459     0.889
## 2 AM WEST      0.822   0.0821     0.855
boxplot(flights_spread$OnTimeRatio ~ flights_spread$Airline, main = "On Time Ratio Box Plot ALASKA / AM WEST Airlines", xlab = "Airlines", ylab = "Number of Flights", col = c("yellow","beige"))

round(ontime_ratio$OnTRatMed,digits = 2)
## [1] 0.89 0.85
barplot(ontime_ratio$OnTRatMed, horiz=FALSE, col = c("yellow","beige"),main="ALASKA / AM West: Delayed Ratio Median", names.arg=c("ALASKA","AM WEST"), xlab = "Airlines")

Creating the TotFlights,DelayedRatio, and OnTimeRatio allows us to explore deeper and see how the the airlines have a close median of on time and delayed flights.