The data is saved on github. In this chunk I pull the data locally then run a few simple cleanup operations to get it ready for ‘tidyr’ and ‘dplyr’
# dowload file from github, save it locally in your home directory
download <- download.file('https://raw.githubusercontent.com/kelloggjohnd/DATA607/master/Week5.csv', destfile = "Week5.csv", method = "wininet")
# manipulate the data into a data frame
data_raw <- data.frame(read.csv(file = "Week5.csv", header = TRUE, sep = ","))
# set the names of the columns
names(data_raw)<- c("Flight", "Status", "Los.Angeles", "Phoenix", "San.Diego","San.Francisco" ,"Seattle")
# Set the to main data columns to Character type
data_raw$Flight <- as.character(data_raw$Flight)
data_raw$Status <- as.character(data_raw$Status)
# Change the pesky "On Time" to "on.time". R keeps looking for function "time"
data_raw <- data_raw %>% mutate(Status=recode(Status, "on time" = "on.time"))
# Raw dataframe
head(data_raw)## Flight Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on.time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on.time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# add the Flight company to each row
data_raw [2,1] <- "ALASKA"
data_raw [5,1] <- "AM WEST"
# remove the blank row in the middle
data_raw <- data_raw [-3,]
head(data_raw)## Flight Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on.time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on.time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
## Flight Status City flight_length
## 1 ALASKA on.time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on.time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on.time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on.time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on.time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on.time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on.time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on.time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on.time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on.time Seattle 201
## 20 AM WEST delayed Seattle 61
## Flight City delayed on.time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
## Status flight_length
## 1 on.time 497
## 2 delayed 62
## 3 on.time 694
## 4 delayed 117
## 5 on.time 221
## 6 delayed 12
## 7 on.time 4840
## 8 delayed 415
## 9 on.time 212
## 10 delayed 20
## 11 on.time 383
## 12 delayed 65
## 13 on.time 503
## 14 delayed 102
## 15 on.time 320
## 16 delayed 129
## 17 on.time 1841
## 18 delayed 305
## 19 on.time 201
## 20 delayed 61
## Flight Status flight_length
## 1 ALASKA on.time 497
## 2 ALASKA delayed 62
## 3 AM WEST on.time 694
## 4 AM WEST delayed 117
## 5 ALASKA on.time 221
## 6 ALASKA delayed 12
## 7 AM WEST on.time 4840
## 8 AM WEST delayed 415
## 9 ALASKA on.time 212
## 10 ALASKA delayed 20
## 11 AM WEST on.time 383
## 12 AM WEST delayed 65
## 13 ALASKA on.time 503
## 14 ALASKA delayed 102
## 15 AM WEST on.time 320
## 16 AM WEST delayed 129
## 17 ALASKA on.time 1841
## 18 ALASKA delayed 305
## 19 AM WEST on.time 201
## 20 AM WEST delayed 61
# Logic: Alaska and On Time
alaska_ontime <- filter(project1, Flight == "ALASKA", Status == "on.time")
alaska_ontime## Flight Status City flight_length
## 1 ALASKA on.time Los.Angeles 497
## 2 ALASKA on.time Phoenix 221
## 3 ALASKA on.time San.Diego 212
## 4 ALASKA on.time San.Francisco 503
## 5 ALASKA on.time Seattle 1841
# Logic: Alaska and Delayed
alaska_delayed<- filter(project1, Flight == "ALASKA", Status == "delayed")
alaska_delayed## Flight Status City flight_length
## 1 ALASKA delayed Los.Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San.Diego 20
## 4 ALASKA delayed San.Francisco 102
## 5 ALASKA delayed Seattle 305
# Logic: AM WEST and On Time
amwest_ontime <- filter(project1, Flight == "AM WEST", Status == "on.time")
amwest_ontime## Flight Status City flight_length
## 1 AM WEST on.time Los.Angeles 694
## 2 AM WEST on.time Phoenix 4840
## 3 AM WEST on.time San.Diego 383
## 4 AM WEST on.time San.Francisco 320
## 5 AM WEST on.time Seattle 201
# Logic: AM WEST and Delayed
amwest_delayed<- filter(project1, Flight == "AM WEST", Status == "delayed")
amwest_delayed## Flight Status City flight_length
## 1 AM WEST delayed Los.Angeles 117
## 2 AM WEST delayed Phoenix 415
## 3 AM WEST delayed San.Diego 65
## 4 AM WEST delayed San.Francisco 129
## 5 AM WEST delayed Seattle 61
## Flight City delayed on.time total
## 1 ALASKA Los.Angeles 62 497 559
## 2 ALASKA Phoenix 12 221 233
## 3 ALASKA San.Diego 20 212 232
## 4 ALASKA San.Francisco 102 503 605
## 5 ALASKA Seattle 305 1841 2146
## 6 AM WEST Los.Angeles 117 694 811
## 7 AM WEST Phoenix 415 4840 5255
## 8 AM WEST San.Diego 65 383 448
## 9 AM WEST San.Francisco 129 320 449
## 10 AM WEST Seattle 61 201 262
## Flight City delayed on.time total
## 1 AM WEST Phoenix 415 4840 5255
## 2 ALASKA Seattle 305 1841 2146
## 3 AM WEST Los.Angeles 117 694 811
## 4 ALASKA San.Francisco 102 503 605
## 5 ALASKA Los.Angeles 62 497 559
## 6 AM WEST San.Francisco 129 320 449
## 7 AM WEST San.Diego 65 383 448
## 8 AM WEST Seattle 61 201 262
## 9 ALASKA Phoenix 12 221 233
## 10 ALASKA San.Diego 20 212 232
compare1<- project2 %>%
group_by(Flight)%>%
summarise(median = median(total), max = max(total), min(total))
compare1## # A tibble: 2 x 4
## Flight median max `min(total)`
## <chr> <int> <int> <int>
## 1 ALASKA 559 2146 232
## 2 AM WEST 449 5255 262
Assumption: The amount of flights and delays are normally distributed
Logic: Probability your flight will be delayed to each city
# defining the percentage of delayed flights for each city
project2 <- mutate(project2, delay_ratio = delayed/total)
project2## Flight City delayed on.time total delay_ratio
## 1 AM WEST Phoenix 415 4840 5255 0.07897241
## 2 ALASKA Seattle 305 1841 2146 0.14212488
## 3 AM WEST Los.Angeles 117 694 811 0.14426634
## 4 ALASKA San.Francisco 102 503 605 0.16859504
## 5 ALASKA Los.Angeles 62 497 559 0.11091234
## 6 AM WEST San.Francisco 129 320 449 0.28730512
## 7 AM WEST San.Diego 65 383 448 0.14508929
## 8 AM WEST Seattle 61 201 262 0.23282443
## 9 ALASKA Phoenix 12 221 233 0.05150215
## 10 ALASKA San.Diego 20 212 232 0.08620690
# Identifying the Mean and the Standard deviation for both airlines.
analysis <-
project2 %>%
group_by(Flight)%>%
summarise(Ttl = sum(total), ONTIME.mean = mean(on.time, na.rm = TRUE),DELAY.mean = mean(delayed, na.rm = TRUE), M.Ratio = (DELAY.mean/ONTIME.mean),
ONTIME.SD = sd(on.time, na.rm = TRUE), DELAY.SD = sd(delayed, na.rm = TRUE), SD.Ratio = (DELAY.SD/ONTIME.SD))
analysis## # A tibble: 2 x 8
## Flight Ttl ONTIME.mean DELAY.mean M.Ratio ONTIME.SD DELAY.SD SD.Ratio
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA 3775 655. 100. 0.153 678. 120. 0.177
## 2 AM WEST 7225 1288. 157. 0.122 1994. 147. 0.0738
First chunk has echo=TRUE, all others in this section for readability have echo = FALSE
if_else(
(project2 %>%
filter(Flight == "ALASKA", City == "Los.Angeles") %>%
select(delay_ratio))<
(project2 %>%
filter(Flight == "AM WEST", City == "Los.Angeles") %>%
select(delay_ratio)), "ALASKA", "AM WEST")## [1] "ALASKA"
delay_alaska <- project2 %>% filter (Flight == "ALASKA", City == "Los.Angeles") %>% select(delay_ratio)
delay_AMWEST <- project2 %>% filter (Flight == "AM WEST", City == "Los.Angeles") %>% select(delay_ratio)
print(paste0("Alaska has a delay ratio of ", sprintf(delay_alaska, fmt = '%#.4f'), " and AM WEST a delay ratio of ", sprintf(delay_AMWEST, fmt = '%#.4f')))## [1] "Alaska has a delay ratio of 0.1109 and AM WEST a delay ratio of 0.1443"
## [1] "ALASKA"
## [1] "Alaska has a delay ratio of 0.0515 and AM WEST a delay ratio of 0.0790"
## [1] "ALASKA"
## [1] "Alaska has a delay ratio of 0.0862 and AM WEST a delay ratio of 0.1451"
## [1] "ALASKA"
## [1] "Alaska has a delay ratio of 0.1686 and AM WEST a delay ratio of 0.2873"
## [1] "ALASKA"
## [1] "Alaska has a delay ratio of 0.1421 and AM WEST a delay ratio of 0.2328"
If you look at the Analysis table only for your choice of the better airline, AM WEST seems to be the better choice. Their ratio of mean On Time vs. delay is less than Alaska. AM West does have 26% more flights in total than Alaska allowing for a larger chance of delays. Using just this table is inconclusive.
Analyzing each individual flight destination for probability of less delays, Alaska has the advantage taking each of the individual flights by having a smaller delay ratio.
If I was going to chose a single carrier to use and didn’t mind a smaller pool of available flights I would chose: