library(knitr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
flights <- read.csv("Sample.csv", header = TRUE, stringsAsFactors = FALSE)
head(flights)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName OriginState
## 1 1/14/2016 OO N8942A 4512 MSP Minneapolis, MN MN
## 2 1/7/2016 WN N663SW 145 ABQ Albuquerque, NM NM
## 3 1/17/2016 DL N942AT 1118 MIA Miami, FL FL
## 4 1/16/2016 WN N221WN 2419 IND Indianapolis, IN IN
## 5 1/25/2016 DL N949DL 1364 ATL Atlanta, GA GA
## 6 1/10/2016 AS N408AS 21 ORD Chicago, IL IL
## Dest DestCityName DestState CRSDepTime DepTime WheelsOff WheelsOn
## 1 FAR Fargo, ND ND 1354 1355 1412 1455
## 2 DAL Dallas, TX TX 555 604 623 842
## 3 LGA New York, NY NY 1301 1315 1342 1606
## 4 DAL Dallas, TX TX 1405 1403 1426 1525
## 5 MCI Kansas City, MO MO 1630 1629 1643 1726
## 6 SEA Seattle, WA WA 1940 1933 1956 2148
## CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime
## 1 1505 1501 0 0 71 66
## 2 840 846 0 0 105 102
## 3 1605 1616 0 0 184 181
## 4 1525 1529 0 0 140 146
## 5 1751 1729 0 0 141 120
## 6 2220 2156 0 0 280 263
## Distance
## 1 223
## 2 580
## 3 1096
## 4 755
## 5 692
## 6 1721
flights$FlightDate <- as.Date(flights$FlightDate, format = "%m/%d/%Y")
flights <- flights %>%
mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)))
flights$new_CRSDepTime <- as.POSIXct(flights$new_CRSDepTime, format="%Y-%m-%d %H%M")
head(flights %>% select(new_CRSDepTime, CRSDepTime))
## new_CRSDepTime CRSDepTime
## 1 2016-01-14 13:54:00 1354
## 2 2016-01-07 05:55:00 555
## 3 2016-01-17 13:01:00 1301
## 4 2016-01-16 14:05:00 1405
## 5 2016-01-25 16:30:00 1630
## 6 2016-01-10 19:40:00 1940
flights <- flights %>%
mutate(new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
flights$new_CRSArrTime <- as.POSIXct(flights$new_CRSArrTime, format="%Y-%m-%d %H%M")
head(flights %>% select(new_CRSArrTime, CRSArrTime))
## new_CRSArrTime CRSArrTime
## 1 2016-01-14 15:05:00 1505
## 2 2016-01-07 08:40:00 840
## 3 2016-01-17 16:05:00 1605
## 4 2016-01-16 15:25:00 1525
## 5 2016-01-25 17:51:00 1751
## 6 2016-01-10 22:20:00 2220
flights <-flights %>% filter(Cancelled == 0) %>% mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)), new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)), new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)))
flights$new_DepTime <- as.POSIXct(flights$new_DepTime, format="%Y-%m-%d %H%M")
flights$new_ArrTime <- as.POSIXct(flights$new_ArrTime, format="%Y-%m-%d %H%M")
flights$new_WheelsOff <- as.POSIXct(flights$new_WheelsOff, format="%Y-%m-%d %H%M")
flights$new_WheelsOn <- as.POSIXct(flights$new_WheelsOn, format ="%Y-%m-%d %H%M")
head(flights)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName OriginState
## 1 2016-01-14 OO N8942A 4512 MSP Minneapolis, MN MN
## 2 2016-01-07 WN N663SW 145 ABQ Albuquerque, NM NM
## 3 2016-01-17 DL N942AT 1118 MIA Miami, FL FL
## 4 2016-01-16 WN N221WN 2419 IND Indianapolis, IN IN
## 5 2016-01-25 DL N949DL 1364 ATL Atlanta, GA GA
## 6 2016-01-10 AS N408AS 21 ORD Chicago, IL IL
## Dest DestCityName DestState CRSDepTime DepTime WheelsOff WheelsOn
## 1 FAR Fargo, ND ND 1354 1355 1412 1455
## 2 DAL Dallas, TX TX 555 604 623 842
## 3 LGA New York, NY NY 1301 1315 1342 1606
## 4 DAL Dallas, TX TX 1405 1403 1426 1525
## 5 MCI Kansas City, MO MO 1630 1629 1643 1726
## 6 SEA Seattle, WA WA 1940 1933 1956 2148
## CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime
## 1 1505 1501 0 0 71 66
## 2 840 846 0 0 105 102
## 3 1605 1616 0 0 184 181
## 4 1525 1529 0 0 140 146
## 5 1751 1729 0 0 141 120
## 6 2220 2156 0 0 280 263
## Distance new_CRSDepTime new_CRSArrTime new_DepTime
## 1 223 2016-01-14 13:54:00 2016-01-14 15:05:00 2016-01-14 13:55:00
## 2 580 2016-01-07 05:55:00 2016-01-07 08:40:00 2016-01-07 06:04:00
## 3 1096 2016-01-17 13:01:00 2016-01-17 16:05:00 2016-01-17 13:15:00
## 4 755 2016-01-16 14:05:00 2016-01-16 15:25:00 2016-01-16 14:03:00
## 5 692 2016-01-25 16:30:00 2016-01-25 17:51:00 2016-01-25 16:29:00
## 6 1721 2016-01-10 19:40:00 2016-01-10 22:20:00 2016-01-10 19:33:00
## new_ArrTime new_WheelsOff new_WheelsOn
## 1 2016-01-14 15:01:00 2016-01-14 14:12:00 2016-01-14 14:55:00
## 2 2016-01-07 08:46:00 2016-01-07 06:23:00 2016-01-07 08:42:00
## 3 2016-01-17 16:16:00 2016-01-17 13:42:00 2016-01-17 16:06:00
## 4 2016-01-16 15:29:00 2016-01-16 14:26:00 2016-01-16 15:25:00
## 5 2016-01-25 17:29:00 2016-01-25 16:43:00 2016-01-25 17:26:00
## 6 2016-01-10 21:56:00 2016-01-10 19:56:00 2016-01-10 21:48:00
str(flights)
## 'data.frame': 484 obs. of 27 variables:
## $ FlightDate : Date, format: "2016-01-14" "2016-01-07" ...
## $ Carrier : chr "OO" "WN" "DL" "WN" ...
## $ TailNum : chr "N8942A" "N663SW" "N942AT" "N221WN" ...
## $ FlightNum : int 4512 145 1118 2419 1364 21 835 1563 334 604 ...
## $ Origin : chr "MSP" "ABQ" "MIA" "IND" ...
## $ OriginCityName : chr "Minneapolis, MN" "Albuquerque, NM" "Miami, FL" "Indianapolis, IN" ...
## $ OriginState : chr "MN" "NM" "FL" "IN" ...
## $ Dest : chr "FAR" "DAL" "LGA" "DAL" ...
## $ DestCityName : chr "Fargo, ND" "Dallas, TX" "New York, NY" "Dallas, TX" ...
## $ DestState : chr "ND" "TX" "NY" "TX" ...
## $ CRSDepTime : int 1354 555 1301 1405 1630 1940 950 725 805 605 ...
## $ DepTime : int 1355 604 1315 1403 1629 1933 1111 713 756 603 ...
## $ WheelsOff : int 1412 623 1342 1426 1643 1956 1127 731 817 628 ...
## $ WheelsOn : int 1455 842 1606 1525 1726 2148 1523 1328 908 902 ...
## $ CRSArrTime : int 1505 840 1605 1525 1751 2220 1424 1400 925 910 ...
## $ ArrTime : int 1501 846 1616 1529 1729 2156 1528 1332 916 908 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRSElapsedTime : int 71 105 184 140 141 280 214 275 80 185 ...
## $ ActualElapsedTime: int 66 102 181 146 120 263 197 259 80 185 ...
## $ Distance : int 223 580 1096 755 692 1721 1475 2072 337 1053 ...
## $ new_CRSDepTime : POSIXct, format: "2016-01-14 13:54:00" "2016-01-07 05:55:00" ...
## $ new_CRSArrTime : POSIXct, format: "2016-01-14 15:05:00" "2016-01-07 08:40:00" ...
## $ new_DepTime : POSIXct, format: "2016-01-14 13:55:00" "2016-01-07 06:04:00" ...
## $ new_ArrTime : POSIXct, format: "2016-01-14 15:01:00" "2016-01-07 08:46:00" ...
## $ new_WheelsOff : POSIXct, format: "2016-01-14 14:12:00" "2016-01-07 06:23:00" ...
## $ new_WheelsOn : POSIXct, format: "2016-01-14 14:55:00" "2016-01-07 08:42:00" ...
#flights %>% filter(!complete.cases(.))
flights <- flights %>% filter(Cancelled == 0) %>% mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins")))
head(flights %>% select(CRSDepTime, DepTime, DepDelay))
## CRSDepTime DepTime DepDelay
## 1 1354 1355 1
## 2 555 604 9
## 3 1301 1315 14
## 4 1405 1403 -2
## 5 1630 1629 -1
## 6 1940 1933 -7
flights <- flights %>%
mutate(new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
flights$new_CRSArrTime <- as.POSIXct(flights$new_CRSArrTime, format="%Y-%m-%d %H%M")
#head(flights %>% select(new_CRSArrTime, CRSArrTime))
flights <-flights %>% filter(Cancelled == 0) %>% mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)), new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)), new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)))
flights$new_DepTime <- as.POSIXct(flights$new_DepTime, format="%Y-%m-%d %H%M")
flights$new_ArrTime <- as.POSIXct(flights$new_ArrTime, format="%Y-%m-%d %H%M")
flights$new_WheelsOff <- as.POSIXct(flights$new_WheelsOff, format="%Y-%m-%d %H%M")
flights$new_WheelsOn <- as.POSIXct(flights$new_WheelsOn, format ="%Y-%m-%d %H%M")
head(flights)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName OriginState
## 1 2016-01-14 OO N8942A 4512 MSP Minneapolis, MN MN
## 2 2016-01-07 WN N663SW 145 ABQ Albuquerque, NM NM
## 3 2016-01-17 DL N942AT 1118 MIA Miami, FL FL
## 4 2016-01-16 WN N221WN 2419 IND Indianapolis, IN IN
## 5 2016-01-25 DL N949DL 1364 ATL Atlanta, GA GA
## 6 2016-01-10 AS N408AS 21 ORD Chicago, IL IL
## Dest DestCityName DestState CRSDepTime DepTime WheelsOff WheelsOn
## 1 FAR Fargo, ND ND 1354 1355 1412 1455
## 2 DAL Dallas, TX TX 555 604 623 842
## 3 LGA New York, NY NY 1301 1315 1342 1606
## 4 DAL Dallas, TX TX 1405 1403 1426 1525
## 5 MCI Kansas City, MO MO 1630 1629 1643 1726
## 6 SEA Seattle, WA WA 1940 1933 1956 2148
## CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime
## 1 1505 1501 0 0 71 66
## 2 840 846 0 0 105 102
## 3 1605 1616 0 0 184 181
## 4 1525 1529 0 0 140 146
## 5 1751 1729 0 0 141 120
## 6 2220 2156 0 0 280 263
## Distance new_CRSDepTime new_CRSArrTime new_DepTime
## 1 223 2016-01-14 13:54:00 2016-01-14 15:05:00 2016-01-14 13:55:00
## 2 580 2016-01-07 05:55:00 2016-01-07 08:40:00 2016-01-07 06:04:00
## 3 1096 2016-01-17 13:01:00 2016-01-17 16:05:00 2016-01-17 13:15:00
## 4 755 2016-01-16 14:05:00 2016-01-16 15:25:00 2016-01-16 14:03:00
## 5 692 2016-01-25 16:30:00 2016-01-25 17:51:00 2016-01-25 16:29:00
## 6 1721 2016-01-10 19:40:00 2016-01-10 22:20:00 2016-01-10 19:33:00
## new_ArrTime new_WheelsOff new_WheelsOn DepDelay
## 1 2016-01-14 15:01:00 2016-01-14 14:12:00 2016-01-14 14:55:00 1
## 2 2016-01-07 08:46:00 2016-01-07 06:23:00 2016-01-07 08:42:00 9
## 3 2016-01-17 16:16:00 2016-01-17 13:42:00 2016-01-17 16:06:00 14
## 4 2016-01-16 15:29:00 2016-01-16 14:26:00 2016-01-16 15:25:00 -2
## 5 2016-01-25 17:29:00 2016-01-25 16:43:00 2016-01-25 17:26:00 -1
## 6 2016-01-10 21:56:00 2016-01-10 19:56:00 2016-01-10 21:48:00 -7
#str(flights)
flights <- flights %>% filter(Cancelled == 0) %>% mutate(DepDelayMinutes = ifelse(DepDelay < 0, 0, DepDelay), DepDel15 = ifelse(DepDelay >= 15, 1, 0))
head(flights %>% select(DepDelay, DepDelayMinutes, DepDel15))
## DepDelay DepDelayMinutes DepDel15
## 1 1 1 0
## 2 9 9 0
## 3 14 14 0
## 4 -2 0 0
## 5 -1 0 0
## 6 -7 0 0
flights <- flights %>% filter(Cancelled == 0) %>% mutate(TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")), TaxiIn = as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")), ArrDelay = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")), ArrDelayMinutes = ifelse(ArrDelay < 0, 0, ArrDelay), ArrDel15 = ifelse(ArrDelay >= 15, 1,0), FlightTimeBuffer = CRSElapsedTime - ActualElapsedTime)
flights <- flights %>% filter(Cancelled == 0) %>% mutate(AirTime = ActualElapsedTime - TaxiOut - TaxiIn)
flights <- flights %>% filter(Cancelled == 0) %>% mutate(AirSpeed = Distance / (AirTime/60))
head(flights)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName OriginState
## 1 2016-01-14 OO N8942A 4512 MSP Minneapolis, MN MN
## 2 2016-01-07 WN N663SW 145 ABQ Albuquerque, NM NM
## 3 2016-01-17 DL N942AT 1118 MIA Miami, FL FL
## 4 2016-01-16 WN N221WN 2419 IND Indianapolis, IN IN
## 5 2016-01-25 DL N949DL 1364 ATL Atlanta, GA GA
## 6 2016-01-10 AS N408AS 21 ORD Chicago, IL IL
## Dest DestCityName DestState CRSDepTime DepTime WheelsOff WheelsOn
## 1 FAR Fargo, ND ND 1354 1355 1412 1455
## 2 DAL Dallas, TX TX 555 604 623 842
## 3 LGA New York, NY NY 1301 1315 1342 1606
## 4 DAL Dallas, TX TX 1405 1403 1426 1525
## 5 MCI Kansas City, MO MO 1630 1629 1643 1726
## 6 SEA Seattle, WA WA 1940 1933 1956 2148
## CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime
## 1 1505 1501 0 0 71 66
## 2 840 846 0 0 105 102
## 3 1605 1616 0 0 184 181
## 4 1525 1529 0 0 140 146
## 5 1751 1729 0 0 141 120
## 6 2220 2156 0 0 280 263
## Distance new_CRSDepTime new_CRSArrTime new_DepTime
## 1 223 2016-01-14 13:54:00 2016-01-14 15:05:00 2016-01-14 13:55:00
## 2 580 2016-01-07 05:55:00 2016-01-07 08:40:00 2016-01-07 06:04:00
## 3 1096 2016-01-17 13:01:00 2016-01-17 16:05:00 2016-01-17 13:15:00
## 4 755 2016-01-16 14:05:00 2016-01-16 15:25:00 2016-01-16 14:03:00
## 5 692 2016-01-25 16:30:00 2016-01-25 17:51:00 2016-01-25 16:29:00
## 6 1721 2016-01-10 19:40:00 2016-01-10 22:20:00 2016-01-10 19:33:00
## new_ArrTime new_WheelsOff new_WheelsOn DepDelay
## 1 2016-01-14 15:01:00 2016-01-14 14:12:00 2016-01-14 14:55:00 1
## 2 2016-01-07 08:46:00 2016-01-07 06:23:00 2016-01-07 08:42:00 9
## 3 2016-01-17 16:16:00 2016-01-17 13:42:00 2016-01-17 16:06:00 14
## 4 2016-01-16 15:29:00 2016-01-16 14:26:00 2016-01-16 15:25:00 -2
## 5 2016-01-25 17:29:00 2016-01-25 16:43:00 2016-01-25 17:26:00 -1
## 6 2016-01-10 21:56:00 2016-01-10 19:56:00 2016-01-10 21:48:00 -7
## DepDelayMinutes DepDel15 TaxiOut TaxiIn ArrDelay ArrDelayMinutes
## 1 1 0 17 6 -4 0
## 2 9 0 19 4 6 6
## 3 14 0 27 10 11 11
## 4 0 0 23 4 4 4
## 5 0 0 14 3 -22 0
## 6 0 0 23 8 -24 0
## ArrDel15 FlightTimeBuffer AirTime AirSpeed
## 1 0 5 43 311.1628
## 2 0 3 79 440.5063
## 3 0 3 144 456.6667
## 4 0 -6 119 380.6723
## 5 0 21 103 403.1068
## 6 0 17 232 445.0862
str(flights)
## 'data.frame': 484 obs. of 38 variables:
## $ FlightDate : Date, format: "2016-01-14" "2016-01-07" ...
## $ Carrier : chr "OO" "WN" "DL" "WN" ...
## $ TailNum : chr "N8942A" "N663SW" "N942AT" "N221WN" ...
## $ FlightNum : int 4512 145 1118 2419 1364 21 835 1563 334 604 ...
## $ Origin : chr "MSP" "ABQ" "MIA" "IND" ...
## $ OriginCityName : chr "Minneapolis, MN" "Albuquerque, NM" "Miami, FL" "Indianapolis, IN" ...
## $ OriginState : chr "MN" "NM" "FL" "IN" ...
## $ Dest : chr "FAR" "DAL" "LGA" "DAL" ...
## $ DestCityName : chr "Fargo, ND" "Dallas, TX" "New York, NY" "Dallas, TX" ...
## $ DestState : chr "ND" "TX" "NY" "TX" ...
## $ CRSDepTime : int 1354 555 1301 1405 1630 1940 950 725 805 605 ...
## $ DepTime : int 1355 604 1315 1403 1629 1933 1111 713 756 603 ...
## $ WheelsOff : int 1412 623 1342 1426 1643 1956 1127 731 817 628 ...
## $ WheelsOn : int 1455 842 1606 1525 1726 2148 1523 1328 908 902 ...
## $ CRSArrTime : int 1505 840 1605 1525 1751 2220 1424 1400 925 910 ...
## $ ArrTime : int 1501 846 1616 1529 1729 2156 1528 1332 916 908 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRSElapsedTime : int 71 105 184 140 141 280 214 275 80 185 ...
## $ ActualElapsedTime: int 66 102 181 146 120 263 197 259 80 185 ...
## $ Distance : int 223 580 1096 755 692 1721 1475 2072 337 1053 ...
## $ new_CRSDepTime : POSIXct, format: "2016-01-14 13:54:00" "2016-01-07 05:55:00" ...
## $ new_CRSArrTime : POSIXct, format: "2016-01-14 15:05:00" "2016-01-07 08:40:00" ...
## $ new_DepTime : POSIXct, format: "2016-01-14 13:55:00" "2016-01-07 06:04:00" ...
## $ new_ArrTime : POSIXct, format: "2016-01-14 15:01:00" "2016-01-07 08:46:00" ...
## $ new_WheelsOff : POSIXct, format: "2016-01-14 14:12:00" "2016-01-07 06:23:00" ...
## $ new_WheelsOn : POSIXct, format: "2016-01-14 14:55:00" "2016-01-07 08:42:00" ...
## $ DepDelay : int 1 9 14 -2 -1 -7 81 -12 -9 -2 ...
## $ DepDelayMinutes : num 1 9 14 0 0 0 81 0 0 0 ...
## $ DepDel15 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ TaxiOut : int 17 19 27 23 14 23 16 18 21 25 ...
## $ TaxiIn : int 6 4 10 4 3 8 5 4 8 6 ...
## $ ArrDelay : int -4 6 11 4 -22 -24 64 -28 -9 -2 ...
## $ ArrDelayMinutes : num 0 6 11 4 0 0 64 0 0 0 ...
## $ ArrDel15 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ FlightTimeBuffer : int 5 3 3 -6 21 17 17 16 0 0 ...
## $ AirTime : int 43 79 144 119 103 232 176 237 51 154 ...
## $ AirSpeed : num 311 441 457 381 403 ...
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
flights2 <- flights %>% filter(Cancelled == 0)
library(ggvis)
q1 <- flights2 %>% group_by(day = wday(FlightDate, label = TRUE)) %>% mutate(delayed = ifelse(DepDelay > 0, 1, 0)) %>% summarize(perc_delay = sum(delayed)/ n())
q1 %>% ggvis(~day, ~perc_delay) %>% layer_bars()
flights2 %>% mutate(delayed = ifelse(DepDelay > 0, 1, 0), time_part = ifelse(CRSDepTime > 1200, "Afternoon/Evening", "Morning")) %>% group_by(time_part) %>% summarize(perc_delay = sum(delayed) / n())
## # A tibble: 2 × 2
## time_part perc_delay
## <chr> <dbl>
## 1 Afternoon/Evening 0.3745318
## 2 Morning 0.2718894
library(chron)
##
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
##
## days, hours, minutes, seconds, years
flights2 %>%group_by(is.weekend(FlightDate)) %>% mutate(delayed = ifelse(DepDelay > 0,1,0)) %>% summarize(perc_delay = sum(delayed)/ n())
## # A tibble: 2 × 2
## `is.weekend(FlightDate)` perc_delay
## <lgl> <dbl>
## 1 FALSE 0.3259053
## 2 TRUE 0.3360000