Assignment – Tidying and Transforming Data

Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. Perform analysis to compare the arrival delays for the two airlines.

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
initial_df <- read_csv("C:/Data 607/flights_db.csv")
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   `Los Angeles` = col_integer(),
##   Phoenix = col_integer(),
##   `San Diego` = col_integer(),
##   `San Francisco` = col_integer(),
##   Seattle = col_integer()
## )
dim(initial_df)
## [1] 5 7

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

names(initial_df)<- c("Airline", "Time_Indicator", "Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle" )
initial_df <- tbl_df(initial_df)
knitr::kable(initial_df,  caption = "Initial Airline Table")
Initial Airline Table
Airline Time_Indicator Los Angeles Phoenix San Diego San Francisco Seattle
ALASKA on time 497 221 212 503 1841
NA delayed 62 12 20 102 305
NA NA NA NA NA NA NA
AM WEST on time 694 4840 383 320 201
NA delayed 117 415 65 129 61
#remove the null row with the filter function and replaced the missing value in the first column
summary_df <- (filter(initial_df, Time_Indicator!= ""))
summary_df[2, "Airline"]  <- "ALASKA"
summary_df[4, "Airline"]  <- "AM WEST"

knitr::kable(summary_df,  caption = "Cleaned Table")
Cleaned Table
Airline Time_Indicator Los Angeles Phoenix San Diego San Francisco Seattle
ALASKA on time 497 221 212 503 1841
ALASKA delayed 62 12 20 102 305
AM WEST on time 694 4840 383 320 201
AM WEST delayed 117 415 65 129 61
#use the gather function to reformat the table
summary_df <- gather(summary_df, city, number_flights, c("Los Angeles", "Phoenix", "San Diego", "San Francisco","Seattle"))

knitr::kable(summary_df,  caption = "Tidyr Transformed Table") 
Tidyr Transformed Table
Airline Time_Indicator city number_flights
ALASKA on time Los Angeles 497
ALASKA delayed Los Angeles 62
AM WEST on time Los Angeles 694
AM WEST delayed Los Angeles 117
ALASKA on time Phoenix 221
ALASKA delayed Phoenix 12
AM WEST on time Phoenix 4840
AM WEST delayed Phoenix 415
ALASKA on time San Diego 212
ALASKA delayed San Diego 20
AM WEST on time San Diego 383
AM WEST delayed San Diego 65
ALASKA on time San Francisco 503
ALASKA delayed San Francisco 102
AM WEST on time San Francisco 320
AM WEST delayed San Francisco 129
ALASKA on time Seattle 1841
ALASKA delayed Seattle 305
AM WEST on time Seattle 201
AM WEST delayed Seattle 61
#filter the "gathered" dataset into 2 for the airlines
alaska_df<- (filter(summary_df, Airline== "ALASKA"))
amwest_df<- (filter(summary_df, Airline== "AM WEST"))

knitr::kable(alaska_df,  caption = "Alaska Table") 
Alaska Table
Airline Time_Indicator city number_flights
ALASKA on time Los Angeles 497
ALASKA delayed Los Angeles 62
ALASKA on time Phoenix 221
ALASKA delayed Phoenix 12
ALASKA on time San Diego 212
ALASKA delayed San Diego 20
ALASKA on time San Francisco 503
ALASKA delayed San Francisco 102
ALASKA on time Seattle 1841
ALASKA delayed Seattle 305
knitr::kable(amwest_df,  caption = "AM WEST Table") 
AM WEST Table
Airline Time_Indicator city number_flights
AM WEST on time Los Angeles 694
AM WEST delayed Los Angeles 117
AM WEST on time Phoenix 4840
AM WEST delayed Phoenix 415
AM WEST on time San Diego 383
AM WEST delayed San Diego 65
AM WEST on time San Francisco 320
AM WEST delayed San Francisco 129
AM WEST on time Seattle 201
AM WEST delayed Seattle 61
#aggregate Alaska
agg_alaska <- select(alaska_df,Airline,  city,  number_flights)
agg_alaska <-   group_by(agg_alaska, city)
(agg_alaska <- summarise(agg_alaska , total_flights =sum(number_flights)))

A tibble: 5 x 2

city total_flights 1 Los Angeles 559 2 Phoenix 233 3 San Diego 232 4 San Francisco 605 5 Seattle 2146

knitr::kable(agg_alaska,  caption = "Aggregate Alaska Table-By City") 
Aggregate Alaska Table-By City
city total_flights
Los Angeles 559
Phoenix 233
San Diego 232
San Francisco 605
Seattle 2146
#aggregate AM WEST
agg_amwest <- select(amwest_df,Airline,  city,  number_flights)
agg_amwest<-    group_by(amwest_df, city)
(agg_amwest <- summarise(agg_amwest , total_flights =sum(number_flights)))

A tibble: 5 x 2

city total_flights 1 Los Angeles 811 2 Phoenix 5255 3 San Diego 448 4 San Francisco 449 5 Seattle 262

knitr::kable(agg_amwest,  caption = "Aggregate AM WEST Table-By City") 
Aggregate AM WEST Table-By City
city total_flights
Los Angeles 811
Phoenix 5255
San Diego 448
San Francisco 449
Seattle 262
#merged the summarized dataframe onto the regular dataframe to calculate %total delayed and on time flights per city
alaska_df<- merge(x = alaska_df, y = agg_alaska , by = "city", all = TRUE) 
alaska_df<- mutate(alaska_df,
       percent_time_indicator = (number_flights / total_flights)*100)
alaska_df<-select(alaska_df,Airline,  Time_Indicator,city,  number_flights,percent_time_indicator )
knitr::kable(alaska_df,  caption = "Consolidated Alaska Table") 
Consolidated Alaska Table
Airline Time_Indicator city number_flights percent_time_indicator
ALASKA on time Los Angeles 497 88.908766
ALASKA delayed Los Angeles 62 11.091234
ALASKA on time Phoenix 221 94.849785
ALASKA delayed Phoenix 12 5.150215
ALASKA on time San Diego 212 91.379310
ALASKA delayed San Diego 20 8.620690
ALASKA on time San Francisco 503 83.140496
ALASKA delayed San Francisco 102 16.859504
ALASKA on time Seattle 1841 85.787512
ALASKA delayed Seattle 305 14.212488
#merged the summarized dataframe onto the regular dataframe to calculate %total delayed and on time flights per city
amwest_df<- merge(x = amwest_df, y = agg_amwest , by = "city", all = TRUE) 
amwest_df<- mutate(amwest_df,
       percent_time_indicator = (number_flights / total_flights)*100)
amwest_df<-select(amwest_df,Airline,  Time_Indicator,city,  number_flights,percent_time_indicator )
knitr::kable(amwest_df,  caption = "Consolidated AM WEST Table")
Consolidated AM WEST Table
Airline Time_Indicator city number_flights percent_time_indicator
AM WEST on time Los Angeles 694 85.573366
AM WEST delayed Los Angeles 117 14.426634
AM WEST on time Phoenix 4840 92.102759
AM WEST delayed Phoenix 415 7.897241
AM WEST on time San Diego 383 85.491071
AM WEST delayed San Diego 65 14.508929
AM WEST on time San Francisco 320 71.269488
AM WEST delayed San Francisco 129 28.730512
AM WEST on time Seattle 201 76.717557
AM WEST delayed Seattle 61 23.282443
#merged the Alaska and Am WEST tables together 
merged_df <- left_join(alaska_df,amwest_df, by=c("Time_Indicator","city"), all = TRUE)
knitr::kable(merged_df,  caption = "Merged Table")
Merged Table
Airline.x Time_Indicator city number_flights.x percent_time_indicator.x Airline.y number_flights.y percent_time_indicator.y
ALASKA on time Los Angeles 497 88.908766 AM WEST 694 85.573366
ALASKA delayed Los Angeles 62 11.091234 AM WEST 117 14.426634
ALASKA on time Phoenix 221 94.849785 AM WEST 4840 92.102759
ALASKA delayed Phoenix 12 5.150215 AM WEST 415 7.897241
ALASKA on time San Diego 212 91.379310 AM WEST 383 85.491071
ALASKA delayed San Diego 20 8.620690 AM WEST 65 14.508929
ALASKA on time San Francisco 503 83.140496 AM WEST 320 71.269488
ALASKA delayed San Francisco 102 16.859504 AM WEST 129 28.730512
ALASKA on time Seattle 1841 85.787512 AM WEST 201 76.717557
ALASKA delayed Seattle 305 14.212488 AM WEST 61 23.282443
delayed_df <- (filter(merged_df, Time_Indicator== "delayed"))
delayed_df <- mutate(delayed_df ,
       airline_of_choice = (percent_time_indicator.x -percent_time_indicator.y))
delayed_df <- mutate(delayed_df ,
       airline_of_choice = if_else(airline_of_choice < 1, "ALASKA", "AM WEST"))
delayed_df  <- tbl_df(delayed_df)

delayed_df <- select(delayed_df, city, percent_time_indicator.x ,percent_time_indicator.y , airline_of_choice )

names(delayed_df)<- c("City", "Alaska Delay %", "AM West Delay %", "Preferred Airline" )
knitr::kable(delayed_df,  caption = "Delayed Table")
Delayed Table
City Alaska Delay % AM West Delay % Preferred Airline
Los Angeles 11.091234 14.426634 ALASKA
Phoenix 5.150215 7.897241 ALASKA
San Diego 8.620690 14.508929 ALASKA
San Francisco 16.859504 28.730512 ALASKA
Seattle 14.212488 23.282443 ALASKA
ontime_df <- (filter(merged_df, Time_Indicator== "on time"))
ontime_df <- mutate(ontime_df ,
       airline_of_choice = (percent_time_indicator.x -percent_time_indicator.y))
ontime_df <- mutate(ontime_df ,
       airline_of_choice = if_else(airline_of_choice > 1, "ALASKA", "AM WEST"))
ontime_df  <- tbl_df(ontime_df)

ontime_df <- select(ontime_df, city, percent_time_indicator.x ,percent_time_indicator.y , airline_of_choice )

names(ontime_df)<- c("City", "Alaska On Time %", "AM West On Time %", "Preferred Airline" )
knitr::kable(ontime_df,  caption = "On Time Table")
On Time Table
City Alaska On Time % AM West On Time % Preferred Airline
Los Angeles 88.90877 85.57337 ALASKA
Phoenix 94.84979 92.10276 ALASKA
San Diego 91.37931 85.49107 ALASKA
San Francisco 83.14050 71.26949 ALASKA
Seattle 85.78751 76.71756 ALASKA