Package Installs

install.packages(“tidyr”)
install.packages(“dplyr”) install.packages(“ggplot2”)

require(tidyr)
require(dplyr)
require(ggplot2)
require(tibble)

Load the source .CSV file

df <- read.csv("https://raw.githubusercontent.com/simon63/Assignment5/master/flights-wk5.csv",
               header = T, stringsAsFactors = F, fileEncoding = "UTF-8-BOM", check.names = F)
# header = T - using this option to use the first line for column names
# check.names = F - using this option to avoid "Los Angeles" being converted to "Los.Angeles"
# fileEncoding = "UTF-8-BOM" - using this option to avoid loading the "\ufeff" encoding character
df
##                   Los Angeles Phoenix San Diego San Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Tidy Data - Assign missing column names

# Missing column names must be assigned before tbl_df() can be used
names(df)[1:2] <- c("Airline", "n_type")

# When attempting to use tbl_df(), I noticed a pop-up description, saying:
#   "Deprecated: please use tibble::as_tibble() instead."
df <- as_tibble(df)
df
## # A tibble: 5 x 7
##   Airline n_type `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>          <int>   <int>       <int>           <int>   <int>
## 1 ALASKA  on ti~           497     221         212             503    1841
## 2 ""      delay~            62      12          20             102     305
## 3 ""      ""                NA      NA          NA              NA      NA
## 4 AM WEST on ti~           694    4840         383             320     201
## 5 ""      delay~           117     415          65             129      61

Tidy Data - Convert columns with city names into a single variable, “city”

# Using gather() to do the conversion and 
# also filtering out rows with NA value in the new "n" column
t <- gather(df, "city", "n", 3:7) %>% filter(!is.na(n))
t
## # A tibble: 20 x 4
##    Airline n_type  city              n
##    <chr>   <chr>   <chr>         <int>
##  1 ALASKA  on time Los Angeles     497
##  2 ""      delayed Los Angeles      62
##  3 AM WEST on time Los Angeles     694
##  4 ""      delayed Los Angeles     117
##  5 ALASKA  on time Phoenix         221
##  6 ""      delayed Phoenix          12
##  7 AM WEST on time Phoenix        4840
##  8 ""      delayed Phoenix         415
##  9 ALASKA  on time San Diego       212
## 10 ""      delayed San Diego        20
## 11 AM WEST on time San Diego       383
## 12 ""      delayed San Diego        65
## 13 ALASKA  on time San Francisco   503
## 14 ""      delayed San Francisco   102
## 15 AM WEST on time San Francisco   320
## 16 ""      delayed San Francisco   129
## 17 ALASKA  on time Seattle        1841
## 18 ""      delayed Seattle         305
## 19 AM WEST on time Seattle         201
## 20 ""      delayed Seattle          61

Tidy Data - Assign missing values for Airline in even rows

# Assign the Airline values from the "Odd" rows into the "Even" rows
t[(c(1:length(t)) %% 2) == 0,1] <- t[(c(1:length(t)) %% 2) != 0,1]
t
## # A tibble: 20 x 4
##    Airline n_type  city              n
##    <chr>   <chr>   <chr>         <int>
##  1 ALASKA  on time Los Angeles     497
##  2 ALASKA  delayed Los Angeles      62
##  3 AM WEST on time Los Angeles     694
##  4 AM WEST delayed Los Angeles     117
##  5 ALASKA  on time Phoenix         221
##  6 ALASKA  delayed Phoenix          12
##  7 AM WEST on time Phoenix        4840
##  8 AM WEST delayed Phoenix         415
##  9 ALASKA  on time San Diego       212
## 10 ALASKA  delayed San Diego        20
## 11 AM WEST on time San Diego       383
## 12 AM WEST delayed San Diego        65
## 13 ALASKA  on time San Francisco   503
## 14 ALASKA  delayed San Francisco   102
## 15 AM WEST on time San Francisco   320
## 16 AM WEST delayed San Francisco   129
## 17 ALASKA  on time Seattle        1841
## 18 ALASKA  delayed Seattle         305
## 19 AM WEST on time Seattle         201
## 20 AM WEST delayed Seattle          61

Tidy Data - Use spread() function to separate “on time” and “delayed” observations into new columns

# Spread would not work without the prior step of assigning missing Airline values
t <- spread(t, n_type, n)
t
## # A tibble: 10 x 4
##    Airline city          delayed `on time`
##    <chr>   <chr>           <int>     <int>
##  1 ALASKA  Los Angeles        62       497
##  2 ALASKA  Phoenix            12       221
##  3 ALASKA  San Diego          20       212
##  4 ALASKA  San Francisco     102       503
##  5 ALASKA  Seattle           305      1841
##  6 AM WEST Los Angeles       117       694
##  7 AM WEST Phoenix           415      4840
##  8 AM WEST San Diego          65       383
##  9 AM WEST San Francisco     129       320
## 10 AM WEST Seattle            61       201

Tidy Data - Mutate by adding two new columns, “% on time” and “% delayed”

t <- mutate(t, pct_delayed = round((delayed / (delayed + t$'on time')) * 100, 2), pct_on_time = 100 - pct_delayed)
t
## # A tibble: 10 x 6
##    Airline city          delayed `on time` pct_delayed pct_on_time
##    <chr>   <chr>           <int>     <int>       <dbl>       <dbl>
##  1 ALASKA  Los Angeles        62       497       11.1         88.9
##  2 ALASKA  Phoenix            12       221        5.15        94.8
##  3 ALASKA  San Diego          20       212        8.62        91.4
##  4 ALASKA  San Francisco     102       503       16.9         83.1
##  5 ALASKA  Seattle           305      1841       14.2         85.8
##  6 AM WEST Los Angeles       117       694       14.4         85.6
##  7 AM WEST Phoenix           415      4840        7.90        92.1
##  8 AM WEST San Diego          65       383       14.5         85.5
##  9 AM WEST San Francisco     129       320       28.7         71.3
## 10 AM WEST Seattle            61       201       23.3         76.7

Plot Data - Comparing delays for the two airlines