Using dplyr and tidyr packages to transform and tidy data.

require(dplyr)
require(tidyr)
require(utils)
require(ggplot2)
library(dplyr)
library(tidyr)
library(utils)
library(ggplot2)

Read the flights CSV file from URL.

flights <- read.csv(url("https://raw.githubusercontent.com/v-sinha/data607/week_05/flights.csv"))

Display the structure and leading rows of the frame.

# Structure
str(flights)
## 'data.frame':    4 obs. of  7 variables:
##  $ Airline      : Factor w/ 2 levels "ALASKA","AM WEST": 1 1 2 2
##  $ flight.status: Factor w/ 2 levels " delayed"," on time": 2 1 2 1
##  $ Los.Angeles  : int  497 62 694 117
##  $ Phoenix      : int  221 12 4840 415
##  $ San.Diego    : int  212 20 383 65
##  $ San.Francisco: int  503 102 320 129
##  $ Seattle      : int  1841 305 201 61
# Alternative to str()
glimpse(flights)
## Observations: 4
## Variables: 7
## $ Airline       <fctr> ALASKA, ALASKA, AM WEST, AM WEST
## $ flight.status <fctr>  on time,  delayed,  on time,  delayed
## $ Los.Angeles   <int> 497, 62, 694, 117
## $ Phoenix       <int> 221, 12, 4840, 415
## $ San.Diego     <int> 212, 20, 383, 65
## $ San.Francisco <int> 503, 102, 320, 129
## $ Seattle       <int> 1841, 305, 201, 61
# Leading rows us
head(flights)
##   Airline flight.status Los.Angeles Phoenix San.Diego San.Francisco
## 1  ALASKA       on time         497     221       212           503
## 2  ALASKA       delayed          62      12        20           102
## 3 AM WEST       on time         694    4840       383           320
## 4 AM WEST       delayed         117     415        65           129
##   Seattle
## 1    1841
## 2     305
## 3     201
## 4      61

The flight information appears to be a wide data frame, because columns 3-7 contain values for each destination, i.e. the column fields are really values. We can use tidyr’s gather() function to transform the wide dataframe into a long one.

# Select columns 3-7 and consolidate (gather) their values into one
# column labeled "Flightcount"
flights_long <- flights %>% gather(Destination, Flightcount, 3:7)

# Display the new structure and the first few rows.
str(flights_long)
## 'data.frame':    20 obs. of  4 variables:
##  $ Airline      : Factor w/ 2 levels "ALASKA","AM WEST": 1 1 2 2 1 1 2 2 1 1 ...
##  $ flight.status: Factor w/ 2 levels " delayed"," on time": 2 1 2 1 2 1 2 1 2 1 ...
##  $ Destination  : chr  "Los.Angeles" "Los.Angeles" "Los.Angeles" "Los.Angeles" ...
##  $ Flightcount  : int  497 62 694 117 221 12 4840 415 212 20 ...
glimpse(flights_long)
## Observations: 20
## Variables: 4
## $ Airline       <fctr> ALASKA, ALASKA, AM WEST, AM WEST, ALASKA, ALASK...
## $ flight.status <fctr>  on time,  delayed,  on time,  delayed,  on tim...
## $ Destination   <chr> "Los.Angeles", "Los.Angeles", "Los.Angeles", "Lo...
## $ Flightcount   <int> 497, 62, 694, 117, 221, 12, 4840, 415, 212, 20, ...
head(flights_long)
##   Airline flight.status Destination Flightcount
## 1  ALASKA       on time Los.Angeles         497
## 2  ALASKA       delayed Los.Angeles          62
## 3 AM WEST       on time Los.Angeles         694
## 4 AM WEST       delayed Los.Angeles         117
## 5  ALASKA       on time     Phoenix         221
## 6  ALASKA       delayed     Phoenix          12

Compare the dimensions of the original and the transformed frames.

dim(flights)
## [1] 4 7
dim(flights_long)
## [1] 20  4
names(flights)
## [1] "Airline"       "flight.status" "Los.Angeles"   "Phoenix"      
## [5] "San.Diego"     "San.Francisco" "Seattle"
names(flights_long)
## [1] "Airline"       "flight.status" "Destination"   "Flightcount"

Compare the flights of the two airlines. For this we filter based on Airline and FlightStatus (i.e. “on time” or “delayed”).

# Extract the information for each of the two airlines.
flights_alaska <- flights_long %>% filter(Airline == "ALASKA")
flights_amwest <-  flights_long %>% filter(Airline == "AM WEST")

# Extract the on-time and delayed information for both airlines.
ontime_alaska <- flights_alaska %>% filter(flight.status == " on time")
ontime_amwest <- flights_amwest %>% filter(flight.status == " on time")

delayed_alaska <- flights_alaska %>% filter(flight.status == " delayed")
delayed_amwest <- flights_amwest %>% filter(flight.status == " delayed")


print(ontime_alaska)
##   Airline flight.status   Destination Flightcount
## 1  ALASKA       on time   Los.Angeles         497
## 2  ALASKA       on time       Phoenix         221
## 3  ALASKA       on time     San.Diego         212
## 4  ALASKA       on time San.Francisco         503
## 5  ALASKA       on time       Seattle        1841
print(ontime_amwest)
##   Airline flight.status   Destination Flightcount
## 1 AM WEST       on time   Los.Angeles         694
## 2 AM WEST       on time       Phoenix        4840
## 3 AM WEST       on time     San.Diego         383
## 4 AM WEST       on time San.Francisco         320
## 5 AM WEST       on time       Seattle         201
print(delayed_alaska)
##   Airline flight.status   Destination Flightcount
## 1  ALASKA       delayed   Los.Angeles          62
## 2  ALASKA       delayed       Phoenix          12
## 3  ALASKA       delayed     San.Diego          20
## 4  ALASKA       delayed San.Francisco         102
## 5  ALASKA       delayed       Seattle         305
print(delayed_amwest)
##   Airline flight.status   Destination Flightcount
## 1 AM WEST       delayed   Los.Angeles         117
## 2 AM WEST       delayed       Phoenix         415
## 3 AM WEST       delayed     San.Diego          65
## 4 AM WEST       delayed San.Francisco         129
## 5 AM WEST       delayed       Seattle          61

Compare the delay as a percentage for both airlines.

percent_alaska <- 100 * sum(delayed_alaska$Flightcount) / (sum(delayed_alaska$Flightcount) + sum(ontime_alaska$Flightcount))
percent_amwest <- 100 * sum(delayed_amwest$Flightcount) / (sum(delayed_amwest$Flightcount) + sum(ontime_amwest$Flightcount))

delays_percent <- data.frame(airline = c("ALASKA", "AM WEST"),
                             delays = c(percent_alaska, percent_amwest))

p <- ggplot(delays_percent, aes(airline, delays))
p +geom_bar(stat = "identity", width=.5)

The conclusion is that, for the selected destination cities, Alaska Airlines was, compared to America West, somewhat more prone to delays (13.2% vs 10.9%), with delays computed as a percentage of total flights completed for each airline.