arrival_data <- read_csv(file = 'https://raw.githubusercontent.com/pmahdi/cuny-data-607/main/assignment-4-arrival_data.csv')
arrival_data
## # A tibble: 5 × 7
## ...1 ...2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
Adding missing headers:
colnames(arrival_data)
## [1] "...1" "...2" "Los Angeles" "Phoenix"
## [5] "San Diego" "San Francisco" "Seattle"
colnames(arrival_data) <- c('airline', 'arrival_status', 'Los Angeles', 'Phoenix', 'San Diego', 'San Francisco', 'Seattle')
colnames(arrival_data)
## [1] "airline" "arrival_status" "Los Angeles" "Phoenix"
## [5] "San Diego" "San Francisco" "Seattle"
Dropping the empty 3rd row and imputing the missing values in
airline:
arrival_data <- arrival_data %>%
drop_na(arrival_status) %>%
fill(airline, .direction = 'down')
arrival_data
## # A tibble: 4 × 7
## airline arrival_status `Los Angeles` Phoenix `San Diego` San Francis…¹ Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
## # … with abbreviated variable name ¹`San Francisco`
Next, the data frame needs to be pivoted to the long format so that each column corresponds to a variable:
arrival_data <- arrival_data %>%
pivot_longer(
cols = -c(1:2),
names_to = 'airport',
values_to = 'flights'
)
arrival_data
## # A tibble: 20 × 4
## airline arrival_status airport flights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San Diego 383
## 14 AM WEST on time San Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San Diego 65
## 19 AM WEST delayed San Francisco 129
## 20 AM WEST delayed Seattle 61
Finding the ratio of all on time flights to all delayed flights:
all_table <- arrival_data %>%
group_by(arrival_status) %>%
summarize(n_flights = sum(flights))
all_ratio <- all_table[1, 2] / all_table[2, 2]
Finding the ratio of on time flights to delayed flights per airline:
airlines_table <- arrival_data %>%
group_by(airline, arrival_status) %>%
summarize(n_flights = sum(flights))
alaska_ratio <- airlines_table[1, 3] / airlines_table[2, 3]
am_west_ratio <- airlines_table[3, 3] / airlines_table[4, 3]
Finding the ratio of on time flights to delayed flights per airport:
airports_table <- arrival_data %>%
group_by(airport, arrival_status) %>%
summarize(n_flights = sum(flights))
la_ratio <- airports_table[1, 3] / airports_table[2, 3]
px_ratio <- airports_table[3, 3] / airports_table[4, 3]
sd_ratio <- airports_table[5, 3] / airports_table[6, 3]
sf_ratio <- airports_table[7, 3] / airports_table[8, 3]
se_ratio <- airports_table[9, 3] / airports_table[10, 3]
Presenting all the ratios in a single data frame:
ratio_df <- tibble(flight_group = c('all_flights', 'alaska_airlines', 'am_west_airlines', 'los_angeles', 'phoenix', 'san_diego', 'san_francisco', 'seattle'), flight_ratio = rep(NA, length.out = 8))
ratio_df['flight_ratio'] <- rbind(all_ratio, alaska_ratio, am_west_ratio, la_ratio, px_ratio, sd_ratio, sf_ratio, se_ratio)
ratio_df
## # A tibble: 8 × 2
## flight_group flight_ratio
## <chr> <dbl>
## 1 all_flights 0.133
## 2 alaska_airlines 0.153
## 3 am_west_airlines 0.122
## 4 los_angeles 0.150
## 5 phoenix 0.0844
## 6 san_diego 0.143
## 7 san_francisco 0.281
## 8 seattle 0.179
Conclusion: Between the airlines, AM WEST comes out ahead in terms of having fewer delays. Meanwhile, Phoenix seems to be the destination to go to if one’s only objective is to escape the authorities as quickly as possible.