flights <- read.csv (file = '/Users/aaronzalki/flights.csv', sep=",",header = TRUE)
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on '/Users/
## aaronzalki/flights.csv'
colnames(flights)<-c("Airline","Status","Los Angeles","Phoenix","San Diego","San Francisco", "Seattle")
flights
## Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
flights$Airline[flights$Airline==""] <- NA
flights$Airline<- na.locf(flights$Airline,option="locf")
autofill_flights <- flights
autofill_flights
## Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Columns 3,4,5,6, and 7 in the original data are the Destination Cities. Using tidyr and dplyr packages, I will stack the columns (3 to 7) data, so that they appear as values under the new column header ‘Destination’. The original numeric values will appear under the new column header ‘Frequency’. I also arranged the data by Flight Status (On Time vs. Delayed)
library ('tidyr')
library ('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
gather_flights <- gather (autofill_flights, "Destination", "Frequency", 3:7)
arranged_by_status <- arrange (gather_flights, Status)
arranged_by_status
## Airline Status Destination Frequency
## 1 ALASKA delayed Los Angeles 62
## 2 AM WEST delayed Los Angeles 117
## 3 ALASKA delayed Phoenix 12
## 4 AM WEST delayed Phoenix 415
## 5 ALASKA delayed San Diego 20
## 6 AM WEST delayed San Diego 65
## 7 ALASKA delayed San Francisco 102
## 8 AM WEST delayed San Francisco 129
## 9 ALASKA delayed Seattle 305
## 10 AM WEST delayed Seattle 61
## 11 ALASKA on time Los Angeles 497
## 12 AM WEST on time Los Angeles 694
## 13 ALASKA on time Phoenix 221
## 14 AM WEST on time Phoenix 4840
## 15 ALASKA on time San Diego 212
## 16 AM WEST on time San Diego 383
## 17 ALASKA on time San Francisco 503
## 18 AM WEST on time San Francisco 320
## 19 ALASKA on time Seattle 1841
## 20 AM WEST on time Seattle 201
arranged_by_status %>%
group_by(Airline) %>%
filter(Status == "delayed") %>%
summarise (mean = mean (Frequency), min = min (Frequency), max = max (Frequency), median = median (Frequency), sd = sd (Frequency),sum = sum (Frequency))
## # A tibble: 2 x 7
## Airline mean min max median sd sum
## <fct> <dbl> <int> <int> <int> <dbl> <int>
## 1 ALASKA 100. 12 305 62 120. 501
## 2 AM WEST 157. 61 415 117 147. 787
mutate_arranged_by_status <- arranged_by_status %>%
group_by(Airline, Destination) %>%
arrange(Airline) %>%
mutate(Flights_City = sum(Frequency),
Ratio_Flights_Per_City = Frequency / Flights_City)
mutate_arranged_by_status
## # A tibble: 20 x 6
## # Groups: Airline, Destination [10]
## Airline Status Destination Frequency Flights_City Ratio_Flights_Per_C…
## <fct> <fct> <chr> <int> <int> <dbl>
## 1 ALASKA delayed Los Angeles 62 559 0.111
## 2 ALASKA delayed Phoenix 12 233 0.0515
## 3 ALASKA delayed San Diego 20 232 0.0862
## 4 ALASKA delayed San Francis… 102 605 0.169
## 5 ALASKA delayed Seattle 305 2146 0.142
## 6 ALASKA on time Los Angeles 497 559 0.889
## 7 ALASKA on time Phoenix 221 233 0.948
## 8 ALASKA on time San Diego 212 232 0.914
## 9 ALASKA on time San Francis… 503 605 0.831
## 10 ALASKA on time Seattle 1841 2146 0.858
## 11 AM WEST delayed Los Angeles 117 811 0.144
## 12 AM WEST delayed Phoenix 415 5255 0.0790
## 13 AM WEST delayed San Diego 65 448 0.145
## 14 AM WEST delayed San Francis… 129 449 0.287
## 15 AM WEST delayed Seattle 61 262 0.233
## 16 AM WEST on time Los Angeles 694 811 0.856
## 17 AM WEST on time Phoenix 4840 5255 0.921
## 18 AM WEST on time San Diego 383 448 0.855
## 19 AM WEST on time San Francis… 320 449 0.713
## 20 AM WEST on time Seattle 201 262 0.767
mutate_arranged_by_status %>%
group_by(Airline) %>%
filter (Status == "delayed") %>%
summarise(mean = mean (Ratio_Flights_Per_City), min = min (Ratio_Flights_Per_City), max = max (Ratio_Flights_Per_City), median = median (Ratio_Flights_Per_City),sd = sd (Ratio_Flights_Per_City))
## # A tibble: 2 x 6
## Airline mean min max median sd
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA 0.112 0.0515 0.169 0.111 0.0459
## 2 AM WEST 0.178 0.0790 0.287 0.145 0.0821
The ratio summary statistics further add to the argument that AM WEST has more delays than ALASKA Airlines with all values being higher.