Read in the data and take all the information that will use for the project

Domestic <- read.csv("/Users/yusufsultan/RStudio/rWork/domestic_flights_jan_2016.csv", header = TRUE, stringsAsFactors = FALSE)
head(Domestic);tail(Domestic)
##   FlightDate Carrier TailNum FlightNum Origin        OriginCityName
## 1   1/6/2016      AA  N4YBAA        43    DFW Dallas/Fort Worth, TX
## 2   1/7/2016      AA  N434AA        43    DFW Dallas/Fort Worth, TX
## 3   1/8/2016      AA  N541AA        43    DFW Dallas/Fort Worth, TX
## 4   1/9/2016      AA  N489AA        43    DFW Dallas/Fort Worth, TX
## 5  1/10/2016      AA  N439AA        43    DFW Dallas/Fort Worth, TX
## 6  1/11/2016      AA  N468AA        43    DFW Dallas/Fort Worth, TX
##   OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1          TX  DTW  Detroit, MI        MI       1100    1057      1112
## 2          TX  DTW  Detroit, MI        MI       1100    1056      1110
## 3          TX  DTW  Detroit, MI        MI       1100    1055      1116
## 4          TX  DTW  Detroit, MI        MI       1100    1102      1115
## 5          TX  DTW  Detroit, MI        MI       1100    1240      1300
## 6          TX  DTW  Detroit, MI        MI       1100    1107      1118
##   WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1     1424       1438    1432         0        0            158
## 2     1416       1438    1426         0        0            158
## 3     1431       1438    1445         0        0            158
## 4     1424       1438    1433         0        0            158
## 5     1617       1438    1631         0        0            158
## 6     1426       1438    1435         0        0            158
##   ActualElapsedTime Distance
## 1               155      986
## 2               150      986
## 3               170      986
## 4               151      986
## 5               171      986
## 6               148      986
##        FlightDate Carrier TailNum FlightNum Origin
## 445822   1/3/2016      F9  N934FR       576    SLC
## 445823   1/3/2016      F9  N934FR       579    DEN
## 445824   1/3/2016      F9  N934FR       720    DEN
## 445825   1/3/2016      F9  N935FR       444    DEN
## 445826   1/3/2016      F9  N935FR       445    FSD
## 445827   1/3/2016      F9  N935FR       693    CID
##                    OriginCityName OriginState Dest       DestCityName
## 445822         Salt Lake City, UT          UT  DEN         Denver, CO
## 445823                 Denver, CO          CO  SLC Salt Lake City, UT
## 445824                 Denver, CO          CO  DCA     Washington, DC
## 445825                 Denver, CO          CO  FSD    Sioux Falls, SD
## 445826            Sioux Falls, SD          SD  DEN         Denver, CO
## 445827 Cedar Rapids/Iowa City, IA          IA  DEN         Denver, CO
##        DestState CRSDepTime DepTime WheelsOff WheelsOn CRSArrTime ArrTime
## 445822        CO       1449    1450      1500     1600       1609    1616
## 445823        UT       1245    1240      1249     1353       1409    1404
## 445824        VA       1649    1728      1745     2233       2159    2241
## 445825        SD       1020    1038      1050     1255       1250    1301
## 445826        CO       1335    1358      1408     1417       1426    1429
## 445827        CO       1909    1900      1920     1953       2029    2000
##        Cancelled Diverted CRSElapsedTime ActualElapsedTime Distance
## 445822         0        0             80                86      391
## 445823         0        0             84                84      391
## 445824         0        0            190               193     1476
## 445825         0        0             90                83      483
## 445826         0        0            111                91      483
## 445827         0        0            140               120      692

First thing before we start , we want to check for missing data patterns.This step is help to see if there is any data missing or if there are amy mistake in our data.The library dplyr used to use the filter function with our data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Domestic %>% filter(!complete.cases(.)) %>% head()
##   FlightDate Carrier TailNum FlightNum Origin OriginCityName OriginState
## 1  1/16/2016      AA  N3CXAA        44    SEA    Seattle, WA          WA
## 2  1/23/2016      AA                44    SEA    Seattle, WA          WA
## 3  1/24/2016      AA  N3DGAA        44    SEA    Seattle, WA          WA
## 4  1/25/2016      AA  N3MNAA        44    SEA    Seattle, WA          WA
## 5  1/15/2016      AA  N3JSAA        45    JFK   New York, NY          NY
## 6  1/23/2016      AA                45    JFK   New York, NY          NY
##   Dest DestCityName DestState CRSDepTime DepTime WheelsOff WheelsOn
## 1  JFK New York, NY        NY        640      NA        NA       NA
## 2  JFK New York, NY        NY        640      NA        NA       NA
## 3  JFK New York, NY        NY        645      NA        NA       NA
## 4  JFK New York, NY        NY        645      NA        NA       NA
## 5  SEA  Seattle, WA        WA       1830      NA        NA       NA
## 6  SEA  Seattle, WA        WA       1830      NA        NA       NA
##   CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime
## 1       1501      NA         1        0            321                NA
## 2       1501      NA         1        0            321                NA
## 3       1506      NA         1        0            321                NA
## 4       1506      NA         1        0            321                NA
## 5       2152      NA         1        0            382                NA
## 6       2152      NA         1        0            382                NA
##   Distance
## 1     2422
## 2     2422
## 3     2422
## 4     2422
## 5     2422
## 6     2422
Now all the missing data is despair and we will see NA values in the fields so we have to filter this values before doing the calculation
library(dplyr)
Domestic$FlightDate <- as.Date(Domestic$FlightDate, format = "%m/%d/%Y")
Domestic %>% filter(Cancelled == 00) %>% summarize(MaxScheduled_dep = max(CRSDepTime), MaxActual_dep = max(DepTime), 
Total = MaxScheduled_dep+MaxActual_dep)
##   MaxScheduled_dep MaxActual_dep Total
## 1             2359          2400  4759
Domestic  %>% summarize(TotalCancelled_flight = sum(Cancelled == 1 ),
TotalDiverted_flight = sum(Diverted == 1),
Total = TotalCancelled_flight+TotalDiverted_flight)
##   TotalCancelled_flight TotalDiverted_flight Total
## 1                 11665                  864 12529
str(Domestic)
## 'data.frame':    445827 obs. of  21 variables:
##  $ FlightDate       : Date, format: "2016-01-06" "2016-01-07" ...
##  $ Carrier          : chr  "AA" "AA" "AA" "AA" ...
##  $ TailNum          : chr  "N4YBAA" "N434AA" "N541AA" "N489AA" ...
##  $ FlightNum        : int  43 43 43 43 43 43 43 43 43 43 ...
##  $ Origin           : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ OriginCityName   : chr  "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" ...
##  $ OriginState      : chr  "TX" "TX" "TX" "TX" ...
##  $ Dest             : chr  "DTW" "DTW" "DTW" "DTW" ...
##  $ DestCityName     : chr  "Detroit, MI" "Detroit, MI" "Detroit, MI" "Detroit, MI" ...
##  $ DestState        : chr  "MI" "MI" "MI" "MI" ...
##  $ CRSDepTime       : int  1100 1100 1100 1100 1100 1100 1100 1100 1100 1100 ...
##  $ DepTime          : int  1057 1056 1055 1102 1240 1107 1059 1055 1058 1056 ...
##  $ WheelsOff        : int  1112 1110 1116 1115 1300 1118 1113 1107 1110 1110 ...
##  $ WheelsOn         : int  1424 1416 1431 1424 1617 1426 1429 1419 1420 1423 ...
##  $ CRSArrTime       : int  1438 1438 1438 1438 1438 1438 1438 1438 1438 1438 ...
##  $ ArrTime          : int  1432 1426 1445 1433 1631 1435 1438 1431 1428 1434 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRSElapsedTime   : int  158 158 158 158 158 158 158 158 158 158 ...
##  $ ActualElapsedTime: int  155 150 170 151 171 148 159 156 150 158 ...
##  $ Distance         : int  986 986 986 986 986 986 986 986 986 986 ...
Domestic <- Domestic %>% 
  mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)))
Domestic$new_CRSDepTime <- as.POSIXct(Domestic$new_CRSDepTime, format="%Y-%m-%d %H%M")
head(Domestic %>% select(new_CRSDepTime, CRSDepTime))
##        new_CRSDepTime CRSDepTime
## 1 2016-01-06 11:00:00       1100
## 2 2016-01-07 11:00:00       1100
## 3 2016-01-08 11:00:00       1100
## 4 2016-01-09 11:00:00       1100
## 5 2016-01-10 11:00:00       1100
## 6 2016-01-11 11:00:00       1100
tail(Domestic %>% select(new_CRSDepTime, CRSDepTime))
##             new_CRSDepTime CRSDepTime
## 445822 2016-01-03 14:49:00       1449
## 445823 2016-01-03 12:45:00       1245
## 445824 2016-01-03 16:49:00       1649
## 445825 2016-01-03 10:20:00       1020
## 445826 2016-01-03 13:35:00       1335
## 445827 2016-01-03 19:09:00       1909
Domestic <- Domestic %>% filter(Cancelled == 0) %>%
  mutate(new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
Domestic$new_CRSArrTime <- as.POSIXct(Domestic$new_CRSArrTime, format="%Y-%m-%d %H%M")
Domestic <- Domestic %>%   
  mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)),
new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)), new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
Domestic$new_DepTime <- as.POSIXct(Domestic$new_DepTime, format="%Y-%m-%d %H%M")
Domestic$new_WheelsOff <- as.POSIXct(Domestic$new_WheelsOff, format="%Y-%m-%d %H%M")
Domestic$new_WheelsOn <- as.POSIXct(Domestic$new_WheelsOn, format="%Y-%m-%d %H%M")
Domestic$new_ArrTime <- as.POSIXct(Domestic$new_ArrTime, format="%Y-%m-%d %H%M")
head(Domestic)
##   FlightDate Carrier TailNum FlightNum Origin        OriginCityName
## 1 2016-01-06      AA  N4YBAA        43    DFW Dallas/Fort Worth, TX
## 2 2016-01-07      AA  N434AA        43    DFW Dallas/Fort Worth, TX
## 3 2016-01-08      AA  N541AA        43    DFW Dallas/Fort Worth, TX
## 4 2016-01-09      AA  N489AA        43    DFW Dallas/Fort Worth, TX
## 5 2016-01-10      AA  N439AA        43    DFW Dallas/Fort Worth, TX
## 6 2016-01-11      AA  N468AA        43    DFW Dallas/Fort Worth, TX
##   OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1          TX  DTW  Detroit, MI        MI       1100    1057      1112
## 2          TX  DTW  Detroit, MI        MI       1100    1056      1110
## 3          TX  DTW  Detroit, MI        MI       1100    1055      1116
## 4          TX  DTW  Detroit, MI        MI       1100    1102      1115
## 5          TX  DTW  Detroit, MI        MI       1100    1240      1300
## 6          TX  DTW  Detroit, MI        MI       1100    1107      1118
##   WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1     1424       1438    1432         0        0            158
## 2     1416       1438    1426         0        0            158
## 3     1431       1438    1445         0        0            158
## 4     1424       1438    1433         0        0            158
## 5     1617       1438    1631         0        0            158
## 6     1426       1438    1435         0        0            158
##   ActualElapsedTime Distance      new_CRSDepTime      new_CRSArrTime
## 1               155      986 2016-01-06 11:00:00 2016-01-06 14:38:00
## 2               150      986 2016-01-07 11:00:00 2016-01-07 14:38:00
## 3               170      986 2016-01-08 11:00:00 2016-01-08 14:38:00
## 4               151      986 2016-01-09 11:00:00 2016-01-09 14:38:00
## 5               171      986 2016-01-10 11:00:00 2016-01-10 14:38:00
## 6               148      986 2016-01-11 11:00:00 2016-01-11 14:38:00
##           new_DepTime       new_WheelsOff        new_WheelsOn
## 1 2016-01-06 10:57:00 2016-01-06 11:12:00 2016-01-06 14:24:00
## 2 2016-01-07 10:56:00 2016-01-07 11:10:00 2016-01-07 14:16:00
## 3 2016-01-08 10:55:00 2016-01-08 11:16:00 2016-01-08 14:31:00
## 4 2016-01-09 11:02:00 2016-01-09 11:15:00 2016-01-09 14:24:00
## 5 2016-01-10 12:40:00 2016-01-10 13:00:00 2016-01-10 16:17:00
## 6 2016-01-11 11:07:00 2016-01-11 11:18:00 2016-01-11 14:26:00
##           new_ArrTime
## 1 2016-01-06 14:32:00
## 2 2016-01-07 14:26:00
## 3 2016-01-08 14:45:00
## 4 2016-01-09 14:33:00
## 5 2016-01-10 16:31:00
## 6 2016-01-11 14:35:00
tail(Domestic)
##        FlightDate Carrier TailNum FlightNum Origin
## 434157 2016-01-03      F9  N934FR       576    SLC
## 434158 2016-01-03      F9  N934FR       579    DEN
## 434159 2016-01-03      F9  N934FR       720    DEN
## 434160 2016-01-03      F9  N935FR       444    DEN
## 434161 2016-01-03      F9  N935FR       445    FSD
## 434162 2016-01-03      F9  N935FR       693    CID
##                    OriginCityName OriginState Dest       DestCityName
## 434157         Salt Lake City, UT          UT  DEN         Denver, CO
## 434158                 Denver, CO          CO  SLC Salt Lake City, UT
## 434159                 Denver, CO          CO  DCA     Washington, DC
## 434160                 Denver, CO          CO  FSD    Sioux Falls, SD
## 434161            Sioux Falls, SD          SD  DEN         Denver, CO
## 434162 Cedar Rapids/Iowa City, IA          IA  DEN         Denver, CO
##        DestState CRSDepTime DepTime WheelsOff WheelsOn CRSArrTime ArrTime
## 434157        CO       1449    1450      1500     1600       1609    1616
## 434158        UT       1245    1240      1249     1353       1409    1404
## 434159        VA       1649    1728      1745     2233       2159    2241
## 434160        SD       1020    1038      1050     1255       1250    1301
## 434161        CO       1335    1358      1408     1417       1426    1429
## 434162        CO       1909    1900      1920     1953       2029    2000
##        Cancelled Diverted CRSElapsedTime ActualElapsedTime Distance
## 434157         0        0             80                86      391
## 434158         0        0             84                84      391
## 434159         0        0            190               193     1476
## 434160         0        0             90                83      483
## 434161         0        0            111                91      483
## 434162         0        0            140               120      692
##             new_CRSDepTime      new_CRSArrTime         new_DepTime
## 434157 2016-01-03 14:49:00 2016-01-03 16:09:00 2016-01-03 14:50:00
## 434158 2016-01-03 12:45:00 2016-01-03 14:09:00 2016-01-03 12:40:00
## 434159 2016-01-03 16:49:00 2016-01-03 21:59:00 2016-01-03 17:28:00
## 434160 2016-01-03 10:20:00 2016-01-03 12:50:00 2016-01-03 10:38:00
## 434161 2016-01-03 13:35:00 2016-01-03 14:26:00 2016-01-03 13:58:00
## 434162 2016-01-03 19:09:00 2016-01-03 20:29:00 2016-01-03 19:00:00
##              new_WheelsOff        new_WheelsOn         new_ArrTime
## 434157 2016-01-03 15:00:00 2016-01-03 16:00:00 2016-01-03 16:16:00
## 434158 2016-01-03 12:49:00 2016-01-03 13:53:00 2016-01-03 14:04:00
## 434159 2016-01-03 17:45:00 2016-01-03 22:33:00 2016-01-03 22:41:00
## 434160 2016-01-03 10:50:00 2016-01-03 12:55:00 2016-01-03 13:01:00
## 434161 2016-01-03 14:08:00 2016-01-03 14:17:00 2016-01-03 14:29:00
## 434162 2016-01-03 19:20:00 2016-01-03 19:53:00 2016-01-03 20:00:00
Domestic <- Domestic %>% filter(Cancelled == 0) %>%
  mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins")))
head(Domestic %>% select(CRSDepTime, DepTime, DepDelay))
##   CRSDepTime DepTime DepDelay
## 1       1100    1057       -3
## 2       1100    1056       -4
## 3       1100    1055       -5
## 4       1100    1102        2
## 5       1100    1240      100
## 6       1100    1107        7

Now we need to do the same for DepTime, WheelsOff, WheelsOn, CRSArrTime, and ArrTime.

Domestic <- Domestic %>% filter(Cancelled == 0) %>% 
  mutate(DepDelayMinutes = ifelse(DepDelay < 0, 0, DepDelay), 
         DepDel10 = ifelse(DepDelay >= 10, 1, 0))
head(Domestic %>% select(DepDelay, DepDelayMinutes, DepDel10))
##   DepDelay DepDelayMinutes DepDel10
## 1       -3               0        0
## 2       -4               0        0
## 3       -5               0        0
## 4        2               2        0
## 5      100             100        1
## 6        7               7        0

Now we need to do the same for DepTime, WheelsOff, WheelsOn, CRSArrTime, and ArrTime.

Domestic <- Domestic %>% filter(Cancelled == 0) %>% 
  mutate(TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")),
         TaxiIn = as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")),
         ArrDelay = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")),
         ArrDelayMinutes = ifelse(ArrDelay < 0, 0, ArrDelay), 
         ArrDel10 = ifelse(ArrDelay >= 10, 1, 0),
         FlightTimeBuffer = CRSElapsedTime - ActualElapsedTime)
Domestic<- Domestic %>% filter(Cancelled == 0) %>% 
  mutate(AirTime = ActualElapsedTime - TaxiOut - TaxiIn)
Domestic <- Domestic %>% filter(Cancelled == 0) %>% 
  mutate(AirSpeed = Distance / (AirTime / 60))
head(Domestic)
##   FlightDate Carrier TailNum FlightNum Origin        OriginCityName
## 1 2016-01-06      AA  N4YBAA        43    DFW Dallas/Fort Worth, TX
## 2 2016-01-07      AA  N434AA        43    DFW Dallas/Fort Worth, TX
## 3 2016-01-08      AA  N541AA        43    DFW Dallas/Fort Worth, TX
## 4 2016-01-09      AA  N489AA        43    DFW Dallas/Fort Worth, TX
## 5 2016-01-10      AA  N439AA        43    DFW Dallas/Fort Worth, TX
## 6 2016-01-11      AA  N468AA        43    DFW Dallas/Fort Worth, TX
##   OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1          TX  DTW  Detroit, MI        MI       1100    1057      1112
## 2          TX  DTW  Detroit, MI        MI       1100    1056      1110
## 3          TX  DTW  Detroit, MI        MI       1100    1055      1116
## 4          TX  DTW  Detroit, MI        MI       1100    1102      1115
## 5          TX  DTW  Detroit, MI        MI       1100    1240      1300
## 6          TX  DTW  Detroit, MI        MI       1100    1107      1118
##   WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1     1424       1438    1432         0        0            158
## 2     1416       1438    1426         0        0            158
## 3     1431       1438    1445         0        0            158
## 4     1424       1438    1433         0        0            158
## 5     1617       1438    1631         0        0            158
## 6     1426       1438    1435         0        0            158
##   ActualElapsedTime Distance      new_CRSDepTime      new_CRSArrTime
## 1               155      986 2016-01-06 11:00:00 2016-01-06 14:38:00
## 2               150      986 2016-01-07 11:00:00 2016-01-07 14:38:00
## 3               170      986 2016-01-08 11:00:00 2016-01-08 14:38:00
## 4               151      986 2016-01-09 11:00:00 2016-01-09 14:38:00
## 5               171      986 2016-01-10 11:00:00 2016-01-10 14:38:00
## 6               148      986 2016-01-11 11:00:00 2016-01-11 14:38:00
##           new_DepTime       new_WheelsOff        new_WheelsOn
## 1 2016-01-06 10:57:00 2016-01-06 11:12:00 2016-01-06 14:24:00
## 2 2016-01-07 10:56:00 2016-01-07 11:10:00 2016-01-07 14:16:00
## 3 2016-01-08 10:55:00 2016-01-08 11:16:00 2016-01-08 14:31:00
## 4 2016-01-09 11:02:00 2016-01-09 11:15:00 2016-01-09 14:24:00
## 5 2016-01-10 12:40:00 2016-01-10 13:00:00 2016-01-10 16:17:00
## 6 2016-01-11 11:07:00 2016-01-11 11:18:00 2016-01-11 14:26:00
##           new_ArrTime DepDelay DepDelayMinutes DepDel10 TaxiOut TaxiIn
## 1 2016-01-06 14:32:00       -3               0        0      15      8
## 2 2016-01-07 14:26:00       -4               0        0      14     10
## 3 2016-01-08 14:45:00       -5               0        0      21     14
## 4 2016-01-09 14:33:00        2               2        0      13      9
## 5 2016-01-10 16:31:00      100             100        1      20     14
## 6 2016-01-11 14:35:00        7               7        0      11      9
##   ArrDelay ArrDelayMinutes ArrDel10 FlightTimeBuffer AirTime AirSpeed
## 1       -6               0        0                3     132 448.1818
## 2      -12               0        0                8     126 469.5238
## 3        7               7        0              -12     135 438.2222
## 4       -5               0        0                7     129 458.6047
## 5      113             113        1              -13     137 431.8248
## 6       -3               0        0               10     128 462.1875

Visualize data with different variables  .

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggvis)
Domestic2 <- Domestic %>% filter(Cancelled == 0)
Q1 <- Domestic2 %>% group_by(Days = wday(FlightDate, label=TRUE)) %>% 
  mutate(Delayed = ifelse(DepDelayMinutes > 0, 1, 0)) %>% 
  summarize(DelayedPerc = sum(Delayed) / n())
Q1 %>% ggvis(~Days, ~DelayedPerc ,fill:="#922B21") %>% layer_bars() %>% add_axis("x", title = "Dayes") %>% 
add_axis("y", title = "Delay Percentage")
Domestic2 %>% ggvis(~Carrier, ~ArrDel10,fill:="#8E44AD") %>% layer_bars() %>% layer_bars() %>% add_axis("x", title = "Airline Companies") %>% 
add_axis("y", title = "Arrive Delay 10 minutes")
Domestic2 %>% ggvis(~Carrier, ~DepDel10,fill:="#E67E22") %>% layer_bars() %>% layer_bars() %>% add_axis("x", title = "Airline Companies") %>% 
add_axis("y", title = "Departure  Delay 10 minutes")
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:ggvis':
## 
##     resolution
g <- ggplot(Domestic2, aes(Carrier , fill =Carrier ,group = Carrier)) +
geom_bar()
g