Assignment – Tidying and Transforming Data
Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. Perform analysis to compare the arrival delays for the two airlines.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
initial_df <- read_csv("C:/Data 607/flights_db.csv")
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_integer(),
## Phoenix = col_integer(),
## `San Diego` = col_integer(),
## `San Francisco` = col_integer(),
## Seattle = col_integer()
## )
dim(initial_df)
## [1] 5 7
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
names(initial_df)<- c("Airline", "Time_Indicator", "Los Angeles", "Phoenix", "San Diego","San Francisco", "Seattle" )
initial_df <- tbl_df(initial_df)
knitr::kable(initial_df, caption = "Initial Airline Table")
Initial Airline Table
| ALASKA |
on time |
497 |
221 |
212 |
503 |
1841 |
| NA |
delayed |
62 |
12 |
20 |
102 |
305 |
| NA |
NA |
NA |
NA |
NA |
NA |
NA |
| AM WEST |
on time |
694 |
4840 |
383 |
320 |
201 |
| NA |
delayed |
117 |
415 |
65 |
129 |
61 |
#remove the null row with the filter function and replaced the missing value in the first column
summary_df <- (filter(initial_df, Time_Indicator!= ""))
summary_df[2, "Airline"] <- "ALASKA"
summary_df[4, "Airline"] <- "AM WEST"
knitr::kable(summary_df, caption = "Cleaned Table")
Cleaned Table
| ALASKA |
on time |
497 |
221 |
212 |
503 |
1841 |
| ALASKA |
delayed |
62 |
12 |
20 |
102 |
305 |
| AM WEST |
on time |
694 |
4840 |
383 |
320 |
201 |
| AM WEST |
delayed |
117 |
415 |
65 |
129 |
61 |
#use the gather function to reformat the table
summary_df <- gather(summary_df, city, number_flights, c("Los Angeles", "Phoenix", "San Diego", "San Francisco","Seattle"))
knitr::kable(summary_df, caption = "Tidyr Transformed Table")
Tidyr Transformed Table
| ALASKA |
on time |
Los Angeles |
497 |
| ALASKA |
delayed |
Los Angeles |
62 |
| AM WEST |
on time |
Los Angeles |
694 |
| AM WEST |
delayed |
Los Angeles |
117 |
| ALASKA |
on time |
Phoenix |
221 |
| ALASKA |
delayed |
Phoenix |
12 |
| AM WEST |
on time |
Phoenix |
4840 |
| AM WEST |
delayed |
Phoenix |
415 |
| ALASKA |
on time |
San Diego |
212 |
| ALASKA |
delayed |
San Diego |
20 |
| AM WEST |
on time |
San Diego |
383 |
| AM WEST |
delayed |
San Diego |
65 |
| ALASKA |
on time |
San Francisco |
503 |
| ALASKA |
delayed |
San Francisco |
102 |
| AM WEST |
on time |
San Francisco |
320 |
| AM WEST |
delayed |
San Francisco |
129 |
| ALASKA |
on time |
Seattle |
1841 |
| ALASKA |
delayed |
Seattle |
305 |
| AM WEST |
on time |
Seattle |
201 |
| AM WEST |
delayed |
Seattle |
61 |
#filter the "gathered" dataset into 2 for the airlines
alaska_df<- (filter(summary_df, Airline== "ALASKA"))
amwest_df<- (filter(summary_df, Airline== "AM WEST"))
knitr::kable(alaska_df, caption = "Alaska Table")
Alaska Table
| ALASKA |
on time |
Los Angeles |
497 |
| ALASKA |
delayed |
Los Angeles |
62 |
| ALASKA |
on time |
Phoenix |
221 |
| ALASKA |
delayed |
Phoenix |
12 |
| ALASKA |
on time |
San Diego |
212 |
| ALASKA |
delayed |
San Diego |
20 |
| ALASKA |
on time |
San Francisco |
503 |
| ALASKA |
delayed |
San Francisco |
102 |
| ALASKA |
on time |
Seattle |
1841 |
| ALASKA |
delayed |
Seattle |
305 |
knitr::kable(amwest_df, caption = "AM WEST Table")
AM WEST Table
| AM WEST |
on time |
Los Angeles |
694 |
| AM WEST |
delayed |
Los Angeles |
117 |
| AM WEST |
on time |
Phoenix |
4840 |
| AM WEST |
delayed |
Phoenix |
415 |
| AM WEST |
on time |
San Diego |
383 |
| AM WEST |
delayed |
San Diego |
65 |
| AM WEST |
on time |
San Francisco |
320 |
| AM WEST |
delayed |
San Francisco |
129 |
| AM WEST |
on time |
Seattle |
201 |
| AM WEST |
delayed |
Seattle |
61 |
#aggregate Alaska
agg_alaska <- select(alaska_df,Airline, city, number_flights)
agg_alaska <- group_by(agg_alaska, city)
(agg_alaska <- summarise(agg_alaska , total_flights =sum(number_flights)))
A tibble: 5 x 2
city total_flights 1 Los Angeles 559 2 Phoenix 233 3 San Diego 232 4 San Francisco 605 5 Seattle 2146
knitr::kable(agg_alaska, caption = "Aggregate Alaska Table-By City")
Aggregate Alaska Table-By City
| Los Angeles |
559 |
| Phoenix |
233 |
| San Diego |
232 |
| San Francisco |
605 |
| Seattle |
2146 |
#aggregate AM WEST
agg_amwest <- select(amwest_df,Airline, city, number_flights)
agg_amwest<- group_by(amwest_df, city)
(agg_amwest <- summarise(agg_amwest , total_flights =sum(number_flights)))
A tibble: 5 x 2
city total_flights 1 Los Angeles 811 2 Phoenix 5255 3 San Diego 448 4 San Francisco 449 5 Seattle 262
knitr::kable(agg_amwest, caption = "Aggregate AM WEST Table-By City")
Aggregate AM WEST Table-By City
| Los Angeles |
811 |
| Phoenix |
5255 |
| San Diego |
448 |
| San Francisco |
449 |
| Seattle |
262 |
#merged the summarized dataframe onto the regular dataframe to calculate %total delayed and on time flights per city
alaska_df<- merge(x = alaska_df, y = agg_alaska , by = "city", all = TRUE)
alaska_df<- mutate(alaska_df,
percent_time_indicator = (number_flights / total_flights)*100)
alaska_df<-select(alaska_df,Airline, Time_Indicator,city, number_flights,percent_time_indicator )
knitr::kable(alaska_df, caption = "Consolidated Alaska Table")
Consolidated Alaska Table
| ALASKA |
on time |
Los Angeles |
497 |
88.908766 |
| ALASKA |
delayed |
Los Angeles |
62 |
11.091234 |
| ALASKA |
on time |
Phoenix |
221 |
94.849785 |
| ALASKA |
delayed |
Phoenix |
12 |
5.150215 |
| ALASKA |
on time |
San Diego |
212 |
91.379310 |
| ALASKA |
delayed |
San Diego |
20 |
8.620690 |
| ALASKA |
on time |
San Francisco |
503 |
83.140496 |
| ALASKA |
delayed |
San Francisco |
102 |
16.859504 |
| ALASKA |
on time |
Seattle |
1841 |
85.787512 |
| ALASKA |
delayed |
Seattle |
305 |
14.212488 |
#merged the summarized dataframe onto the regular dataframe to calculate %total delayed and on time flights per city
amwest_df<- merge(x = amwest_df, y = agg_amwest , by = "city", all = TRUE)
amwest_df<- mutate(amwest_df,
percent_time_indicator = (number_flights / total_flights)*100)
amwest_df<-select(amwest_df,Airline, Time_Indicator,city, number_flights,percent_time_indicator )
knitr::kable(amwest_df, caption = "Consolidated AM WEST Table")
Consolidated AM WEST Table
| AM WEST |
on time |
Los Angeles |
694 |
85.573366 |
| AM WEST |
delayed |
Los Angeles |
117 |
14.426634 |
| AM WEST |
on time |
Phoenix |
4840 |
92.102759 |
| AM WEST |
delayed |
Phoenix |
415 |
7.897241 |
| AM WEST |
on time |
San Diego |
383 |
85.491071 |
| AM WEST |
delayed |
San Diego |
65 |
14.508929 |
| AM WEST |
on time |
San Francisco |
320 |
71.269488 |
| AM WEST |
delayed |
San Francisco |
129 |
28.730512 |
| AM WEST |
on time |
Seattle |
201 |
76.717557 |
| AM WEST |
delayed |
Seattle |
61 |
23.282443 |
#merged the Alaska and Am WEST tables together
merged_df <- left_join(alaska_df,amwest_df, by=c("Time_Indicator","city"), all = TRUE)
knitr::kable(merged_df, caption = "Merged Table")
Merged Table
| ALASKA |
on time |
Los Angeles |
497 |
88.908766 |
AM WEST |
694 |
85.573366 |
| ALASKA |
delayed |
Los Angeles |
62 |
11.091234 |
AM WEST |
117 |
14.426634 |
| ALASKA |
on time |
Phoenix |
221 |
94.849785 |
AM WEST |
4840 |
92.102759 |
| ALASKA |
delayed |
Phoenix |
12 |
5.150215 |
AM WEST |
415 |
7.897241 |
| ALASKA |
on time |
San Diego |
212 |
91.379310 |
AM WEST |
383 |
85.491071 |
| ALASKA |
delayed |
San Diego |
20 |
8.620690 |
AM WEST |
65 |
14.508929 |
| ALASKA |
on time |
San Francisco |
503 |
83.140496 |
AM WEST |
320 |
71.269488 |
| ALASKA |
delayed |
San Francisco |
102 |
16.859504 |
AM WEST |
129 |
28.730512 |
| ALASKA |
on time |
Seattle |
1841 |
85.787512 |
AM WEST |
201 |
76.717557 |
| ALASKA |
delayed |
Seattle |
305 |
14.212488 |
AM WEST |
61 |
23.282443 |
delayed_df <- (filter(merged_df, Time_Indicator== "delayed"))
delayed_df <- mutate(delayed_df ,
airline_of_choice = (percent_time_indicator.x -percent_time_indicator.y))
delayed_df <- mutate(delayed_df ,
airline_of_choice = if_else(airline_of_choice < 1, "ALASKA", "AM WEST"))
delayed_df <- tbl_df(delayed_df)
delayed_df <- select(delayed_df, city, percent_time_indicator.x ,percent_time_indicator.y , airline_of_choice )
names(delayed_df)<- c("City", "Alaska Delay %", "AM West Delay %", "Preferred Airline" )
knitr::kable(delayed_df, caption = "Delayed Table")
Delayed Table
| Los Angeles |
11.091234 |
14.426634 |
ALASKA |
| Phoenix |
5.150215 |
7.897241 |
ALASKA |
| San Diego |
8.620690 |
14.508929 |
ALASKA |
| San Francisco |
16.859504 |
28.730512 |
ALASKA |
| Seattle |
14.212488 |
23.282443 |
ALASKA |
ontime_df <- (filter(merged_df, Time_Indicator== "on time"))
ontime_df <- mutate(ontime_df ,
airline_of_choice = (percent_time_indicator.x -percent_time_indicator.y))
ontime_df <- mutate(ontime_df ,
airline_of_choice = if_else(airline_of_choice > 1, "ALASKA", "AM WEST"))
ontime_df <- tbl_df(ontime_df)
ontime_df <- select(ontime_df, city, percent_time_indicator.x ,percent_time_indicator.y , airline_of_choice )
names(ontime_df)<- c("City", "Alaska On Time %", "AM West On Time %", "Preferred Airline" )
knitr::kable(ontime_df, caption = "On Time Table")
On Time Table
| Los Angeles |
88.90877 |
85.57337 |
ALASKA |
| Phoenix |
94.84979 |
92.10276 |
ALASKA |
| San Diego |
91.37931 |
85.49107 |
ALASKA |
| San Francisco |
83.14050 |
71.26949 |
ALASKA |
| Seattle |
85.78751 |
76.71756 |
ALASKA |