domestic<- read.csv("C:/Users/joyof/Desktop/JOY OFIELU MBA 676/domestic_flights_jan_2016.csv")
head(domestic)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName
## 1 1/6/2016 AA N4YBAA 43 DFW Dallas/Fort Worth, TX
## 2 1/7/2016 AA N434AA 43 DFW Dallas/Fort Worth, TX
## 3 1/8/2016 AA N541AA 43 DFW Dallas/Fort Worth, TX
## 4 1/9/2016 AA N489AA 43 DFW Dallas/Fort Worth, TX
## 5 1/10/2016 AA N439AA 43 DFW Dallas/Fort Worth, TX
## 6 1/11/2016 AA N468AA 43 DFW Dallas/Fort Worth, TX
## OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1 TX DTW Detroit, MI MI 1100 1057 1112
## 2 TX DTW Detroit, MI MI 1100 1056 1110
## 3 TX DTW Detroit, MI MI 1100 1055 1116
## 4 TX DTW Detroit, MI MI 1100 1102 1115
## 5 TX DTW Detroit, MI MI 1100 1240 1300
## 6 TX DTW Detroit, MI MI 1100 1107 1118
## WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1 1424 1438 1432 0 0 158
## 2 1416 1438 1426 0 0 158
## 3 1431 1438 1445 0 0 158
## 4 1424 1438 1433 0 0 158
## 5 1617 1438 1631 0 0 158
## 6 1426 1438 1435 0 0 158
## ActualElapsedTime Distance
## 1 155 986
## 2 150 986
## 3 170 986
## 4 151 986
## 5 171 986
## 6 148 986
library(dplyr)
head(domestic %>% filter(!complete.cases(.)))
## FlightDate Carrier TailNum FlightNum Origin OriginCityName OriginState
## 1 1/16/2016 AA N3CXAA 44 SEA Seattle, WA WA
## 2 1/23/2016 AA 44 SEA Seattle, WA WA
## 3 1/24/2016 AA N3DGAA 44 SEA Seattle, WA WA
## 4 1/25/2016 AA N3MNAA 44 SEA Seattle, WA WA
## 5 1/15/2016 AA N3JSAA 45 JFK New York, NY NY
## 6 1/23/2016 AA 45 JFK New York, NY NY
## Dest DestCityName DestState CRSDepTime DepTime WheelsOff WheelsOn
## 1 JFK New York, NY NY 640 NA NA NA
## 2 JFK New York, NY NY 640 NA NA NA
## 3 JFK New York, NY NY 645 NA NA NA
## 4 JFK New York, NY NY 645 NA NA NA
## 5 SEA Seattle, WA WA 1830 NA NA NA
## 6 SEA Seattle, WA WA 1830 NA NA NA
## CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime
## 1 1501 NA 1 0 321 NA
## 2 1501 NA 1 0 321 NA
## 3 1506 NA 1 0 321 NA
## 4 1506 NA 1 0 321 NA
## 5 2152 NA 1 0 382 NA
## 6 2152 NA 1 0 382 NA
## Distance
## 1 2422
## 2 2422
## 3 2422
## 4 2422
## 5 2422
## 6 2422
domestic$FlightDate <- as.Date(domestic$FlightDate, format = "%m/%d/%Y")
domestic%>% filter(Cancelled == 0) %>% summarize(maxcrsdep = max(CRSDepTime), maxdep = max(DepTime))
## maxcrsdep maxdep
## 1 2359 2400
domestic <- domestic %>%
mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)))
domestic$new_CRSDepTime <- as.POSIXct(domestic$new_CRSDepTime, format="%Y-%m-%d %H%M")
head(domestic %>% select(new_CRSDepTime, CRSDepTime))
## new_CRSDepTime CRSDepTime
## 1 2016-01-06 11:00:00 1100
## 2 2016-01-07 11:00:00 1100
## 3 2016-01-08 11:00:00 1100
## 4 2016-01-09 11:00:00 1100
## 5 2016-01-10 11:00:00 1100
## 6 2016-01-11 11:00:00 1100
domestic <- domestic %>%
mutate(new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
domestic$new_CRSArrTime <- as.POSIXct(domestic$new_CRSArrTime, format="%Y-%m-%d %H%M")
domestic <- domestic %>% filter(Cancelled == 0) %>%
mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)),
new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)), new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
domestic$new_DepTime <- as.POSIXct(domestic$new_DepTime, format="%Y-%m-%d %H%M")
domestic$new_WheelsOff <- as.POSIXct(domestic$new_WheelsOff, format="%Y-%m-%d %H%M")
domestic$new_WheelsOn <- as.POSIXct(domestic$new_WheelsOn, format="%Y-%m-%d %H%M")
domestic$new_ArrTime <- as.POSIXct(domestic$new_ArrTime, format="%Y-%m-%d %H%M")
head(domestic)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName
## 1 2016-01-06 AA N4YBAA 43 DFW Dallas/Fort Worth, TX
## 2 2016-01-07 AA N434AA 43 DFW Dallas/Fort Worth, TX
## 3 2016-01-08 AA N541AA 43 DFW Dallas/Fort Worth, TX
## 4 2016-01-09 AA N489AA 43 DFW Dallas/Fort Worth, TX
## 5 2016-01-10 AA N439AA 43 DFW Dallas/Fort Worth, TX
## 6 2016-01-11 AA N468AA 43 DFW Dallas/Fort Worth, TX
## OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1 TX DTW Detroit, MI MI 1100 1057 1112
## 2 TX DTW Detroit, MI MI 1100 1056 1110
## 3 TX DTW Detroit, MI MI 1100 1055 1116
## 4 TX DTW Detroit, MI MI 1100 1102 1115
## 5 TX DTW Detroit, MI MI 1100 1240 1300
## 6 TX DTW Detroit, MI MI 1100 1107 1118
## WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1 1424 1438 1432 0 0 158
## 2 1416 1438 1426 0 0 158
## 3 1431 1438 1445 0 0 158
## 4 1424 1438 1433 0 0 158
## 5 1617 1438 1631 0 0 158
## 6 1426 1438 1435 0 0 158
## ActualElapsedTime Distance new_CRSDepTime new_CRSArrTime
## 1 155 986 2016-01-06 11:00:00 2016-01-06 14:38:00
## 2 150 986 2016-01-07 11:00:00 2016-01-07 14:38:00
## 3 170 986 2016-01-08 11:00:00 2016-01-08 14:38:00
## 4 151 986 2016-01-09 11:00:00 2016-01-09 14:38:00
## 5 171 986 2016-01-10 11:00:00 2016-01-10 14:38:00
## 6 148 986 2016-01-11 11:00:00 2016-01-11 14:38:00
## new_DepTime new_WheelsOff new_WheelsOn
## 1 2016-01-06 10:57:00 2016-01-06 11:12:00 2016-01-06 14:24:00
## 2 2016-01-07 10:56:00 2016-01-07 11:10:00 2016-01-07 14:16:00
## 3 2016-01-08 10:55:00 2016-01-08 11:16:00 2016-01-08 14:31:00
## 4 2016-01-09 11:02:00 2016-01-09 11:15:00 2016-01-09 14:24:00
## 5 2016-01-10 12:40:00 2016-01-10 13:00:00 2016-01-10 16:17:00
## 6 2016-01-11 11:07:00 2016-01-11 11:18:00 2016-01-11 14:26:00
## new_ArrTime
## 1 2016-01-06 14:32:00
## 2 2016-01-07 14:26:00
## 3 2016-01-08 14:45:00
## 4 2016-01-09 14:33:00
## 5 2016-01-10 16:31:00
## 6 2016-01-11 14:35:00
domestic <-domestic %>% filter(Cancelled == 0) %>%
mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins")))
head(domestic %>% select(CRSDepTime, DepTime, DepDelay))
## CRSDepTime DepTime DepDelay
## 1 1100 1057 -3
## 2 1100 1056 -4
## 3 1100 1055 -5
## 4 1100 1102 2
## 5 1100 1240 100
## 6 1100 1107 7
domestic <- domestic %>% filter(Cancelled == 0) %>%
mutate(DepDelayMinutes = ifelse(DepDelay < 0, 0, DepDelay),
DepDel15 = ifelse(DepDelay >= 15, 1, 0))
head(domestic %>% select(DepDelay, DepDelayMinutes, DepDel15))
## DepDelay DepDelayMinutes DepDel15
## 1 -3 0 0
## 2 -4 0 0
## 3 -5 0 0
## 4 2 2 0
## 5 100 100 1
## 6 7 7 0
domestic <- domestic %>% filter(Cancelled == 0) %>%
mutate(TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")),
TaxiIn = as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")),
ArrDelay = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")),
ArrDelayMinutes = ifelse(ArrDelay < 0, 0, ArrDelay),
ArrDel15 = ifelse(ArrDelay >= 15, 1, 0),
FlightTimeBuffer = CRSElapsedTime - ActualElapsedTime)
domestic <- domestic %>% filter(Cancelled == 0) %>%
mutate(AirTime = ActualElapsedTime - TaxiOut - TaxiIn)
domestic <- domestic %>% filter(Cancelled == 0) %>%
mutate(AirSpeed = Distance / (AirTime / 60))
head(domestic)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName
## 1 2016-01-06 AA N4YBAA 43 DFW Dallas/Fort Worth, TX
## 2 2016-01-07 AA N434AA 43 DFW Dallas/Fort Worth, TX
## 3 2016-01-08 AA N541AA 43 DFW Dallas/Fort Worth, TX
## 4 2016-01-09 AA N489AA 43 DFW Dallas/Fort Worth, TX
## 5 2016-01-10 AA N439AA 43 DFW Dallas/Fort Worth, TX
## 6 2016-01-11 AA N468AA 43 DFW Dallas/Fort Worth, TX
## OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1 TX DTW Detroit, MI MI 1100 1057 1112
## 2 TX DTW Detroit, MI MI 1100 1056 1110
## 3 TX DTW Detroit, MI MI 1100 1055 1116
## 4 TX DTW Detroit, MI MI 1100 1102 1115
## 5 TX DTW Detroit, MI MI 1100 1240 1300
## 6 TX DTW Detroit, MI MI 1100 1107 1118
## WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1 1424 1438 1432 0 0 158
## 2 1416 1438 1426 0 0 158
## 3 1431 1438 1445 0 0 158
## 4 1424 1438 1433 0 0 158
## 5 1617 1438 1631 0 0 158
## 6 1426 1438 1435 0 0 158
## ActualElapsedTime Distance new_CRSDepTime new_CRSArrTime
## 1 155 986 2016-01-06 11:00:00 2016-01-06 14:38:00
## 2 150 986 2016-01-07 11:00:00 2016-01-07 14:38:00
## 3 170 986 2016-01-08 11:00:00 2016-01-08 14:38:00
## 4 151 986 2016-01-09 11:00:00 2016-01-09 14:38:00
## 5 171 986 2016-01-10 11:00:00 2016-01-10 14:38:00
## 6 148 986 2016-01-11 11:00:00 2016-01-11 14:38:00
## new_DepTime new_WheelsOff new_WheelsOn
## 1 2016-01-06 10:57:00 2016-01-06 11:12:00 2016-01-06 14:24:00
## 2 2016-01-07 10:56:00 2016-01-07 11:10:00 2016-01-07 14:16:00
## 3 2016-01-08 10:55:00 2016-01-08 11:16:00 2016-01-08 14:31:00
## 4 2016-01-09 11:02:00 2016-01-09 11:15:00 2016-01-09 14:24:00
## 5 2016-01-10 12:40:00 2016-01-10 13:00:00 2016-01-10 16:17:00
## 6 2016-01-11 11:07:00 2016-01-11 11:18:00 2016-01-11 14:26:00
## new_ArrTime DepDelay DepDelayMinutes DepDel15 TaxiOut TaxiIn
## 1 2016-01-06 14:32:00 -3 0 0 15 8
## 2 2016-01-07 14:26:00 -4 0 0 14 10
## 3 2016-01-08 14:45:00 -5 0 0 21 14
## 4 2016-01-09 14:33:00 2 2 0 13 9
## 5 2016-01-10 16:31:00 100 100 1 20 14
## 6 2016-01-11 14:35:00 7 7 0 11 9
## ArrDelay ArrDelayMinutes ArrDel15 FlightTimeBuffer AirTime AirSpeed
## 1 -6 0 0 3 132 448.1818
## 2 -12 0 0 8 126 469.5238
## 3 7 7 0 -12 135 438.2222
## 4 -5 0 0 7 129 458.6047
## 5 113 113 1 -13 137 431.8248
## 6 -3 0 0 10 128 462.1875
domestic2 <- domestic %>% filter(Cancelled == 0)
library(ggplot2)
library(lubridate)
q1 <- domestic2 %>% group_by(day = wday(FlightDate, label=TRUE)) %>%
mutate(delayed = ifelse(DepDelay > 0, 1, 0)) %>%
summarize(perc_delay = sum(delayed) / n())
q1 %>% ggplot(aes(x = day, y = perc_delay)) + geom_col()
domestic2 %>% mutate(delayed = ifelse(DepDelay > 0, 1, 0), time_part = ifelse(CRSDepTime > 1200, "Afternoon/Evening", "Morning")) %>%
group_by(time_part) %>%
summarize(perc_delay = sum(delayed) / n())
## # A tibble: 2 x 2
## time_part perc_delay
## <chr> <dbl>
## 1 Afternoon/Evening 0.397
## 2 Morning 0.262
library(chron)
domestic2%>% group_by(is.weekend(FlightDate)) %>%
mutate(delayed = ifelse(DepDelay > 0, 1, 0)) %>%
summarize(perc_delay = sum(delayed) / n())
## # A tibble: 2 x 2
## `is.weekend(FlightDate)` perc_delay
## <lgl> <dbl>
## 1 FALSE 0.335
## 2 TRUE 0.353
domestic2 %>% group_by(day = wday(FlightDate, label=TRUE)) %>% filter(ArrDelayMinutes > 0) %>% count(ArrDelayMinutes > 0)
## # A tibble: 7 x 3
## # Groups: day [7]
## day `ArrDelayMinutes > 0` n
## <ord> <lgl> <int>
## 1 Sun TRUE 23324
## 2 Mon TRUE 20702
## 3 Tue TRUE 17541
## 4 Wed TRUE 17111
## 5 Thu TRUE 19014
## 6 Fri TRUE 25110
## 7 Sat TRUE 17001
# Total Number of delays by day for January
q3 <- domestic %>%
group_by(day = wday(FlightDate, label=TRUE)) %>% select(day, ArrDelayMinutes) %>%
filter(ArrDelayMinutes > 0) %>%
summarise(delaysByDay = n())
q3
## # A tibble: 7 x 2
## day delaysByDay
## <ord> <int>
## 1 Sun 23324
## 2 Mon 20702
## 3 Tue 17541
## 4 Wed 17111
## 5 Thu 19014
## 6 Fri 25110
## 7 Sat 17001
# Number of flights delayed by carrier
q4 <- domestic %>%
filter(ArrDelayMinutes > 0) %>%
group_by(Carrier) %>%
summarise(delaysByCarrier = n())
q4
## # A tibble: 12 x 2
## Carrier delaysByCarrier
## <fct> <int>
## 1 AA 24914
## 2 AS 3947
## 3 B6 9695
## 4 DL 18768
## 5 EV 12426
## 6 F9 1723
## 7 HA 1892
## 8 NK 5270
## 9 OO 17513
## 10 UA 9766
## 11 VX 2059
## 12 WN 31830
# Number of flights delayed by state
q5 <- domestic2%>%
group_by(OriginState) %>%
filter(ArrDelayMinutes > 0) %>%
count(ArrDelayMinutes > 0)
q5
## # A tibble: 52 x 3
## # Groups: OriginState [52]
## OriginState `ArrDelayMinutes > 0` n
## <fct> <lgl> <int>
## 1 AK TRUE 667
## 2 AL TRUE 508
## 3 AR TRUE 306
## 4 AZ TRUE 5438
## 5 CA TRUE 20986
## 6 CO TRUE 5926
## 7 CT TRUE 464
## 8 FL TRUE 12764
## 9 GA TRUE 8791
## 10 HI TRUE 1917
## # ... with 42 more rows
# Mean average air speed per air carrier when a flight is delayed
q6 <- domestic2 %>% filter(ArrDelayMinutes > 0) %>%
group_by(Carrier) %>%
summarize(MeanAirSpeed = mean(AirSpeed, na.rm = TRUE))
q6
## # A tibble: 12 x 2
## Carrier MeanAirSpeed
## <fct> <dbl>
## 1 AA 413.
## 2 AS 420.
## 3 B6 414.
## 4 DL 407.
## 5 EV 356.
## 6 F9 434.
## 7 HA 338.
## 8 NK 424.
## 9 OO 361.
## 10 UA 437.
## 11 VX 420.
## 12 WN 405.
# Mean average air speed per air carrier when a flight is not delayed
q7 <- domestic2 %>% filter(ArrDelayMinutes == 0) %>%
group_by(Carrier) %>%
summarize(MeanAirSpeedNoDelay = mean(AirSpeed, na.rm = TRUE))
q7
## # A tibble: 12 x 2
## Carrier MeanAirSpeedNoDelay
## <fct> <dbl>
## 1 AA 421.
## 2 AS 430.
## 3 B6 413.
## 4 DL 416.
## 5 EV 360.
## 6 F9 439.
## 7 HA 346.
## 8 NK 435.
## 9 OO 372.
## 10 UA 442.
## 11 VX 442.
## 12 WN 416.
air_speed_query_results <- merge(q6, q7)
library(ggvis)
q4 %>%
ggvis(x =~Carrier, y =~ delaysByCarrier) %>%
layer_bars()
#add_axis("x", title="Carriers") %>%
#add_axis("y", title="Mean AirSpeed”)
Looking at the barplot we can see that there is maximun delay in WN,followed by AA and the minimum is F9
library(ggvis)
air_speed_query_results %>%
ggvis(x =~Carrier, y =~ MeanAirSpeed) %>%
layer_points(stroke:="red") %>%
layer_points(x=~Carrier, y=~MeanAirSpeedNoDelay, stroke:= "blue") %>%
add_axis("x", title="Carriers") %>%
add_axis("y", title="Mean Air Speed")
looking at the speed we can see that UA and VX have the highest speedand HA have the min.speed.