Read in the data and take all the information that will use for the project
Domestic <- read.csv("/Users/yusufsultan/RStudio/rWork/domestic_flights_jan_2016.csv", header = TRUE, stringsAsFactors = FALSE)
head(Domestic);tail(Domestic)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName
## 1 1/6/2016 AA N4YBAA 43 DFW Dallas/Fort Worth, TX
## 2 1/7/2016 AA N434AA 43 DFW Dallas/Fort Worth, TX
## 3 1/8/2016 AA N541AA 43 DFW Dallas/Fort Worth, TX
## 4 1/9/2016 AA N489AA 43 DFW Dallas/Fort Worth, TX
## 5 1/10/2016 AA N439AA 43 DFW Dallas/Fort Worth, TX
## 6 1/11/2016 AA N468AA 43 DFW Dallas/Fort Worth, TX
## OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1 TX DTW Detroit, MI MI 1100 1057 1112
## 2 TX DTW Detroit, MI MI 1100 1056 1110
## 3 TX DTW Detroit, MI MI 1100 1055 1116
## 4 TX DTW Detroit, MI MI 1100 1102 1115
## 5 TX DTW Detroit, MI MI 1100 1240 1300
## 6 TX DTW Detroit, MI MI 1100 1107 1118
## WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1 1424 1438 1432 0 0 158
## 2 1416 1438 1426 0 0 158
## 3 1431 1438 1445 0 0 158
## 4 1424 1438 1433 0 0 158
## 5 1617 1438 1631 0 0 158
## 6 1426 1438 1435 0 0 158
## ActualElapsedTime Distance
## 1 155 986
## 2 150 986
## 3 170 986
## 4 151 986
## 5 171 986
## 6 148 986
## FlightDate Carrier TailNum FlightNum Origin
## 445822 1/3/2016 F9 N934FR 576 SLC
## 445823 1/3/2016 F9 N934FR 579 DEN
## 445824 1/3/2016 F9 N934FR 720 DEN
## 445825 1/3/2016 F9 N935FR 444 DEN
## 445826 1/3/2016 F9 N935FR 445 FSD
## 445827 1/3/2016 F9 N935FR 693 CID
## OriginCityName OriginState Dest DestCityName
## 445822 Salt Lake City, UT UT DEN Denver, CO
## 445823 Denver, CO CO SLC Salt Lake City, UT
## 445824 Denver, CO CO DCA Washington, DC
## 445825 Denver, CO CO FSD Sioux Falls, SD
## 445826 Sioux Falls, SD SD DEN Denver, CO
## 445827 Cedar Rapids/Iowa City, IA IA DEN Denver, CO
## DestState CRSDepTime DepTime WheelsOff WheelsOn CRSArrTime ArrTime
## 445822 CO 1449 1450 1500 1600 1609 1616
## 445823 UT 1245 1240 1249 1353 1409 1404
## 445824 VA 1649 1728 1745 2233 2159 2241
## 445825 SD 1020 1038 1050 1255 1250 1301
## 445826 CO 1335 1358 1408 1417 1426 1429
## 445827 CO 1909 1900 1920 1953 2029 2000
## Cancelled Diverted CRSElapsedTime ActualElapsedTime Distance
## 445822 0 0 80 86 391
## 445823 0 0 84 84 391
## 445824 0 0 190 193 1476
## 445825 0 0 90 83 483
## 445826 0 0 111 91 483
## 445827 0 0 140 120 692
First thing before we start , we want to check for missing data patterns.This step is help to see if there is any data missing or if there are amy mistake in our data.The library dplyr used to use the filter function with our data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Domestic %>% filter(!complete.cases(.)) %>% head()
## FlightDate Carrier TailNum FlightNum Origin OriginCityName OriginState
## 1 1/16/2016 AA N3CXAA 44 SEA Seattle, WA WA
## 2 1/23/2016 AA 44 SEA Seattle, WA WA
## 3 1/24/2016 AA N3DGAA 44 SEA Seattle, WA WA
## 4 1/25/2016 AA N3MNAA 44 SEA Seattle, WA WA
## 5 1/15/2016 AA N3JSAA 45 JFK New York, NY NY
## 6 1/23/2016 AA 45 JFK New York, NY NY
## Dest DestCityName DestState CRSDepTime DepTime WheelsOff WheelsOn
## 1 JFK New York, NY NY 640 NA NA NA
## 2 JFK New York, NY NY 640 NA NA NA
## 3 JFK New York, NY NY 645 NA NA NA
## 4 JFK New York, NY NY 645 NA NA NA
## 5 SEA Seattle, WA WA 1830 NA NA NA
## 6 SEA Seattle, WA WA 1830 NA NA NA
## CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime
## 1 1501 NA 1 0 321 NA
## 2 1501 NA 1 0 321 NA
## 3 1506 NA 1 0 321 NA
## 4 1506 NA 1 0 321 NA
## 5 2152 NA 1 0 382 NA
## 6 2152 NA 1 0 382 NA
## Distance
## 1 2422
## 2 2422
## 3 2422
## 4 2422
## 5 2422
## 6 2422
Now all the missing data is despair and we will see NA values in the fields so we have to filter this values before doing the calculation
library(dplyr)
Domestic$FlightDate <- as.Date(Domestic$FlightDate, format = "%m/%d/%Y")
Domestic %>% filter(Cancelled == 00) %>% summarize(MaxScheduled_dep = max(CRSDepTime), MaxActual_dep = max(DepTime),
Total = MaxScheduled_dep+MaxActual_dep)
## MaxScheduled_dep MaxActual_dep Total
## 1 2359 2400 4759
Domestic %>% summarize(TotalCancelled_flight = sum(Cancelled == 1 ),
TotalDiverted_flight = sum(Diverted == 1),
Total = TotalCancelled_flight+TotalDiverted_flight)
## TotalCancelled_flight TotalDiverted_flight Total
## 1 11665 864 12529
str(Domestic)
## 'data.frame': 445827 obs. of 21 variables:
## $ FlightDate : Date, format: "2016-01-06" "2016-01-07" ...
## $ Carrier : chr "AA" "AA" "AA" "AA" ...
## $ TailNum : chr "N4YBAA" "N434AA" "N541AA" "N489AA" ...
## $ FlightNum : int 43 43 43 43 43 43 43 43 43 43 ...
## $ Origin : chr "DFW" "DFW" "DFW" "DFW" ...
## $ OriginCityName : chr "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" ...
## $ OriginState : chr "TX" "TX" "TX" "TX" ...
## $ Dest : chr "DTW" "DTW" "DTW" "DTW" ...
## $ DestCityName : chr "Detroit, MI" "Detroit, MI" "Detroit, MI" "Detroit, MI" ...
## $ DestState : chr "MI" "MI" "MI" "MI" ...
## $ CRSDepTime : int 1100 1100 1100 1100 1100 1100 1100 1100 1100 1100 ...
## $ DepTime : int 1057 1056 1055 1102 1240 1107 1059 1055 1058 1056 ...
## $ WheelsOff : int 1112 1110 1116 1115 1300 1118 1113 1107 1110 1110 ...
## $ WheelsOn : int 1424 1416 1431 1424 1617 1426 1429 1419 1420 1423 ...
## $ CRSArrTime : int 1438 1438 1438 1438 1438 1438 1438 1438 1438 1438 ...
## $ ArrTime : int 1432 1426 1445 1433 1631 1435 1438 1431 1428 1434 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRSElapsedTime : int 158 158 158 158 158 158 158 158 158 158 ...
## $ ActualElapsedTime: int 155 150 170 151 171 148 159 156 150 158 ...
## $ Distance : int 986 986 986 986 986 986 986 986 986 986 ...
Domestic <- Domestic %>%
mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)))
Domestic$new_CRSDepTime <- as.POSIXct(Domestic$new_CRSDepTime, format="%Y-%m-%d %H%M")
head(Domestic %>% select(new_CRSDepTime, CRSDepTime))
## new_CRSDepTime CRSDepTime
## 1 2016-01-06 11:00:00 1100
## 2 2016-01-07 11:00:00 1100
## 3 2016-01-08 11:00:00 1100
## 4 2016-01-09 11:00:00 1100
## 5 2016-01-10 11:00:00 1100
## 6 2016-01-11 11:00:00 1100
tail(Domestic %>% select(new_CRSDepTime, CRSDepTime))
## new_CRSDepTime CRSDepTime
## 445822 2016-01-03 14:49:00 1449
## 445823 2016-01-03 12:45:00 1245
## 445824 2016-01-03 16:49:00 1649
## 445825 2016-01-03 10:20:00 1020
## 445826 2016-01-03 13:35:00 1335
## 445827 2016-01-03 19:09:00 1909
Domestic <- Domestic %>% filter(Cancelled == 0) %>%
mutate(new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
Domestic$new_CRSArrTime <- as.POSIXct(Domestic$new_CRSArrTime, format="%Y-%m-%d %H%M")
Domestic <- Domestic %>%
mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)),
new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)), new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
Domestic$new_DepTime <- as.POSIXct(Domestic$new_DepTime, format="%Y-%m-%d %H%M")
Domestic$new_WheelsOff <- as.POSIXct(Domestic$new_WheelsOff, format="%Y-%m-%d %H%M")
Domestic$new_WheelsOn <- as.POSIXct(Domestic$new_WheelsOn, format="%Y-%m-%d %H%M")
Domestic$new_ArrTime <- as.POSIXct(Domestic$new_ArrTime, format="%Y-%m-%d %H%M")
head(Domestic)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName
## 1 2016-01-06 AA N4YBAA 43 DFW Dallas/Fort Worth, TX
## 2 2016-01-07 AA N434AA 43 DFW Dallas/Fort Worth, TX
## 3 2016-01-08 AA N541AA 43 DFW Dallas/Fort Worth, TX
## 4 2016-01-09 AA N489AA 43 DFW Dallas/Fort Worth, TX
## 5 2016-01-10 AA N439AA 43 DFW Dallas/Fort Worth, TX
## 6 2016-01-11 AA N468AA 43 DFW Dallas/Fort Worth, TX
## OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1 TX DTW Detroit, MI MI 1100 1057 1112
## 2 TX DTW Detroit, MI MI 1100 1056 1110
## 3 TX DTW Detroit, MI MI 1100 1055 1116
## 4 TX DTW Detroit, MI MI 1100 1102 1115
## 5 TX DTW Detroit, MI MI 1100 1240 1300
## 6 TX DTW Detroit, MI MI 1100 1107 1118
## WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1 1424 1438 1432 0 0 158
## 2 1416 1438 1426 0 0 158
## 3 1431 1438 1445 0 0 158
## 4 1424 1438 1433 0 0 158
## 5 1617 1438 1631 0 0 158
## 6 1426 1438 1435 0 0 158
## ActualElapsedTime Distance new_CRSDepTime new_CRSArrTime
## 1 155 986 2016-01-06 11:00:00 2016-01-06 14:38:00
## 2 150 986 2016-01-07 11:00:00 2016-01-07 14:38:00
## 3 170 986 2016-01-08 11:00:00 2016-01-08 14:38:00
## 4 151 986 2016-01-09 11:00:00 2016-01-09 14:38:00
## 5 171 986 2016-01-10 11:00:00 2016-01-10 14:38:00
## 6 148 986 2016-01-11 11:00:00 2016-01-11 14:38:00
## new_DepTime new_WheelsOff new_WheelsOn
## 1 2016-01-06 10:57:00 2016-01-06 11:12:00 2016-01-06 14:24:00
## 2 2016-01-07 10:56:00 2016-01-07 11:10:00 2016-01-07 14:16:00
## 3 2016-01-08 10:55:00 2016-01-08 11:16:00 2016-01-08 14:31:00
## 4 2016-01-09 11:02:00 2016-01-09 11:15:00 2016-01-09 14:24:00
## 5 2016-01-10 12:40:00 2016-01-10 13:00:00 2016-01-10 16:17:00
## 6 2016-01-11 11:07:00 2016-01-11 11:18:00 2016-01-11 14:26:00
## new_ArrTime
## 1 2016-01-06 14:32:00
## 2 2016-01-07 14:26:00
## 3 2016-01-08 14:45:00
## 4 2016-01-09 14:33:00
## 5 2016-01-10 16:31:00
## 6 2016-01-11 14:35:00
tail(Domestic)
## FlightDate Carrier TailNum FlightNum Origin
## 434157 2016-01-03 F9 N934FR 576 SLC
## 434158 2016-01-03 F9 N934FR 579 DEN
## 434159 2016-01-03 F9 N934FR 720 DEN
## 434160 2016-01-03 F9 N935FR 444 DEN
## 434161 2016-01-03 F9 N935FR 445 FSD
## 434162 2016-01-03 F9 N935FR 693 CID
## OriginCityName OriginState Dest DestCityName
## 434157 Salt Lake City, UT UT DEN Denver, CO
## 434158 Denver, CO CO SLC Salt Lake City, UT
## 434159 Denver, CO CO DCA Washington, DC
## 434160 Denver, CO CO FSD Sioux Falls, SD
## 434161 Sioux Falls, SD SD DEN Denver, CO
## 434162 Cedar Rapids/Iowa City, IA IA DEN Denver, CO
## DestState CRSDepTime DepTime WheelsOff WheelsOn CRSArrTime ArrTime
## 434157 CO 1449 1450 1500 1600 1609 1616
## 434158 UT 1245 1240 1249 1353 1409 1404
## 434159 VA 1649 1728 1745 2233 2159 2241
## 434160 SD 1020 1038 1050 1255 1250 1301
## 434161 CO 1335 1358 1408 1417 1426 1429
## 434162 CO 1909 1900 1920 1953 2029 2000
## Cancelled Diverted CRSElapsedTime ActualElapsedTime Distance
## 434157 0 0 80 86 391
## 434158 0 0 84 84 391
## 434159 0 0 190 193 1476
## 434160 0 0 90 83 483
## 434161 0 0 111 91 483
## 434162 0 0 140 120 692
## new_CRSDepTime new_CRSArrTime new_DepTime
## 434157 2016-01-03 14:49:00 2016-01-03 16:09:00 2016-01-03 14:50:00
## 434158 2016-01-03 12:45:00 2016-01-03 14:09:00 2016-01-03 12:40:00
## 434159 2016-01-03 16:49:00 2016-01-03 21:59:00 2016-01-03 17:28:00
## 434160 2016-01-03 10:20:00 2016-01-03 12:50:00 2016-01-03 10:38:00
## 434161 2016-01-03 13:35:00 2016-01-03 14:26:00 2016-01-03 13:58:00
## 434162 2016-01-03 19:09:00 2016-01-03 20:29:00 2016-01-03 19:00:00
## new_WheelsOff new_WheelsOn new_ArrTime
## 434157 2016-01-03 15:00:00 2016-01-03 16:00:00 2016-01-03 16:16:00
## 434158 2016-01-03 12:49:00 2016-01-03 13:53:00 2016-01-03 14:04:00
## 434159 2016-01-03 17:45:00 2016-01-03 22:33:00 2016-01-03 22:41:00
## 434160 2016-01-03 10:50:00 2016-01-03 12:55:00 2016-01-03 13:01:00
## 434161 2016-01-03 14:08:00 2016-01-03 14:17:00 2016-01-03 14:29:00
## 434162 2016-01-03 19:20:00 2016-01-03 19:53:00 2016-01-03 20:00:00
Now the calculations should be fairly straightforward
DepDelay = new_DepTime - new_CRSDepTime
DepDelayMinutes - ifelse(DepDelay < 0, 0, DepDelay)
DepDel15Â -Â ifelse(DepDelay >= 15, 1, 0)
TaxiOut - new_WheelsOff - new_DepTime
TaxiIn - new_ArrTime - new_WheelsOn
ArrDelay - new_ArrTime - new_ArrDepTime
ArrDelayMinutes - ifelse(ArrDelay < 0, 0, ArrDelay)
ArrDel15Â -Â ifelse(ArrDelay >= 15, 1, 0)
FlightTimeBuffer - CRSElapsedTime - ActualElapsedTime
AirTime - ActualElapsedTime - TaxiOut - TaxiIn
AirSpeed - Distance / (Airtime / 60)
Domestic <- Domestic %>% filter(Cancelled == 0) %>%
mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins")))
head(Domestic %>% select(CRSDepTime, DepTime, DepDelay))
## CRSDepTime DepTime DepDelay
## 1 1100 1057 -3
## 2 1100 1056 -4
## 3 1100 1055 -5
## 4 1100 1102 2
## 5 1100 1240 100
## 6 1100 1107 7
Now we need to do the same for DepTime, WheelsOff, WheelsOn, CRSArrTime, and ArrTime.
Domestic <- Domestic %>% filter(Cancelled == 0) %>%
mutate(DepDelayMinutes = ifelse(DepDelay < 0, 0, DepDelay),
DepDel10 = ifelse(DepDelay >= 10, 1, 0))
head(Domestic %>% select(DepDelay, DepDelayMinutes, DepDel10))
## DepDelay DepDelayMinutes DepDel10
## 1 -3 0 0
## 2 -4 0 0
## 3 -5 0 0
## 4 2 2 0
## 5 100 100 1
## 6 7 7 0
Now we need to do the same for DepTime, WheelsOff, WheelsOn, CRSArrTime, and ArrTime.
Domestic <- Domestic %>% filter(Cancelled == 0) %>%
mutate(TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")),
TaxiIn = as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")),
ArrDelay = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")),
ArrDelayMinutes = ifelse(ArrDelay < 0, 0, ArrDelay),
ArrDel10 = ifelse(ArrDelay >= 10, 1, 0),
FlightTimeBuffer = CRSElapsedTime - ActualElapsedTime)
Domestic<- Domestic %>% filter(Cancelled == 0) %>%
mutate(AirTime = ActualElapsedTime - TaxiOut - TaxiIn)
Domestic <- Domestic %>% filter(Cancelled == 0) %>%
mutate(AirSpeed = Distance / (AirTime / 60))
head(Domestic)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName
## 1 2016-01-06 AA N4YBAA 43 DFW Dallas/Fort Worth, TX
## 2 2016-01-07 AA N434AA 43 DFW Dallas/Fort Worth, TX
## 3 2016-01-08 AA N541AA 43 DFW Dallas/Fort Worth, TX
## 4 2016-01-09 AA N489AA 43 DFW Dallas/Fort Worth, TX
## 5 2016-01-10 AA N439AA 43 DFW Dallas/Fort Worth, TX
## 6 2016-01-11 AA N468AA 43 DFW Dallas/Fort Worth, TX
## OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1 TX DTW Detroit, MI MI 1100 1057 1112
## 2 TX DTW Detroit, MI MI 1100 1056 1110
## 3 TX DTW Detroit, MI MI 1100 1055 1116
## 4 TX DTW Detroit, MI MI 1100 1102 1115
## 5 TX DTW Detroit, MI MI 1100 1240 1300
## 6 TX DTW Detroit, MI MI 1100 1107 1118
## WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1 1424 1438 1432 0 0 158
## 2 1416 1438 1426 0 0 158
## 3 1431 1438 1445 0 0 158
## 4 1424 1438 1433 0 0 158
## 5 1617 1438 1631 0 0 158
## 6 1426 1438 1435 0 0 158
## ActualElapsedTime Distance new_CRSDepTime new_CRSArrTime
## 1 155 986 2016-01-06 11:00:00 2016-01-06 14:38:00
## 2 150 986 2016-01-07 11:00:00 2016-01-07 14:38:00
## 3 170 986 2016-01-08 11:00:00 2016-01-08 14:38:00
## 4 151 986 2016-01-09 11:00:00 2016-01-09 14:38:00
## 5 171 986 2016-01-10 11:00:00 2016-01-10 14:38:00
## 6 148 986 2016-01-11 11:00:00 2016-01-11 14:38:00
## new_DepTime new_WheelsOff new_WheelsOn
## 1 2016-01-06 10:57:00 2016-01-06 11:12:00 2016-01-06 14:24:00
## 2 2016-01-07 10:56:00 2016-01-07 11:10:00 2016-01-07 14:16:00
## 3 2016-01-08 10:55:00 2016-01-08 11:16:00 2016-01-08 14:31:00
## 4 2016-01-09 11:02:00 2016-01-09 11:15:00 2016-01-09 14:24:00
## 5 2016-01-10 12:40:00 2016-01-10 13:00:00 2016-01-10 16:17:00
## 6 2016-01-11 11:07:00 2016-01-11 11:18:00 2016-01-11 14:26:00
## new_ArrTime DepDelay DepDelayMinutes DepDel10 TaxiOut TaxiIn
## 1 2016-01-06 14:32:00 -3 0 0 15 8
## 2 2016-01-07 14:26:00 -4 0 0 14 10
## 3 2016-01-08 14:45:00 -5 0 0 21 14
## 4 2016-01-09 14:33:00 2 2 0 13 9
## 5 2016-01-10 16:31:00 100 100 1 20 14
## 6 2016-01-11 14:35:00 7 7 0 11 9
## ArrDelay ArrDelayMinutes ArrDel10 FlightTimeBuffer AirTime AirSpeed
## 1 -6 0 0 3 132 448.1818
## 2 -12 0 0 8 126 469.5238
## 3 7 7 0 -12 135 438.2222
## 4 -5 0 0 7 129 458.6047
## 5 113 113 1 -13 137 431.8248
## 6 -3 0 0 10 128 462.1875
Visualize data with different variables .
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggvis)
Domestic2 <- Domestic %>% filter(Cancelled == 0)
Q1 <- Domestic2 %>% group_by(Days = wday(FlightDate, label=TRUE)) %>%
mutate(Delayed = ifelse(DepDelayMinutes > 0, 1, 0)) %>%
summarize(DelayedPerc = sum(Delayed) / n())
Q1 %>% ggvis(~Days, ~DelayedPerc ,fill:="#922B21") %>% layer_bars() %>% add_axis("x", title = "Dayes") %>%
add_axis("y", title = "Delay Percentage")
Domestic2 %>% ggvis(~Carrier, ~ArrDel10,fill:="#8E44AD") %>% layer_bars() %>% layer_bars() %>% add_axis("x", title = "Airline Companies") %>%
add_axis("y", title = "Arrive Delay 10 minutes")
Domestic2 %>% ggvis(~Carrier, ~DepDel10,fill:="#E67E22") %>% layer_bars() %>% layer_bars() %>% add_axis("x", title = "Airline Companies") %>%
add_axis("y", title = "Departure Delay 10 minutes")
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:ggvis':
##
## resolution
g <- ggplot(Domestic2, aes(Carrier , fill =Carrier ,group = Carrier)) +
geom_bar()
g