Import necessary packages and libraries

if(!require(hflights)) install.packages("hflights")
## Loading required package: hflights
suppressMessages(library(hflights))
suppressMessages(library(dplyr))
## Warning: package 'dplyr' was built under R version 4.1.2

Exploring and importing data

data(package="hflights")
data(hflights)
str(hflights)
## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
head(hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011     1          1         6    1400    1500            AA       428
## 5425 2011     1          2         7    1401    1501            AA       428
## 5426 2011     1          3         1    1352    1502            AA       428
## 5427 2011     1          4         2    1403    1513            AA       428
## 5428 2011     1          5         3    1405    1507            AA       428
## 5429 2011     1          6         4    1359    1503            AA       428
##      TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424  N576AA                60      40      -10        0    IAH  DFW      224
## 5425  N557AA                60      45       -9        1    IAH  DFW      224
## 5426  N541AA                70      48       -8       -8    IAH  DFW      224
## 5427  N403AA                70      39        3        3    IAH  DFW      224
## 5428  N492AA                62      44       -3        5    IAH  DFW      224
## 5429  N262AA                64      45       -7       -1    IAH  DFW      224
##      TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424      7      13         0                         0
## 5425      6       9         0                         0
## 5426      5      17         0                         0
## 5427      9      22         0                         0
## 5428      9       9         0                         0
## 5429      6      13         0                         0

Filtering for rows using both indexing and dplyr function methods:

hflights[hflights$Month==1 & hflights$DayofMonth==1, ] %>% head(5)
##       Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424  2011     1          1         6    1400    1500            AA       428
## 6343  2011     1          1         6     728     840            AA       460
## 19266 2011     1          1         6    1631    1736            AA      1121
## 23655 2011     1          1         6    1756    2112            AA      1294
## 33051 2011     1          1         6    1012    1347            AA      1700
##       TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424   N576AA                60      40      -10        0    IAH  DFW      224
## 6343   N520AA                72      41        5        8    IAH  DFW      224
## 19266  N4WVAA                65      37       -9        1    IAH  DFW      224
## 23655  N3DGAA               136     113       -3        1    IAH  MIA      964
## 33051  N3DAAA               155     117        7       -8    IAH  MIA      964
##       TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424       7      13         0                         0
## 6343       6      25         0                         0
## 19266     16      12         0                         0
## 23655      9      14         0                         0
## 33051     12      26         0                         0
filter(hflights, Month==1, DayofMonth==1) %>% head(5)
##   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011     1          1         6    1400    1500            AA       428
## 2 2011     1          1         6     728     840            AA       460
## 3 2011     1          1         6    1631    1736            AA      1121
## 4 2011     1          1         6    1756    2112            AA      1294
## 5 2011     1          1         6    1012    1347            AA      1700
##   TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1  N576AA                60      40      -10        0    IAH  DFW      224
## 2  N520AA                72      41        5        8    IAH  DFW      224
## 3  N4WVAA                65      37       -9        1    IAH  DFW      224
## 4  N3DGAA               136     113       -3        1    IAH  MIA      964
## 5  N3DAAA               155     117        7       -8    IAH  MIA      964
##   TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1      7      13         0                         0
## 2      6      25         0                         0
## 3     16      12         0                         0
## 4      9      14         0                         0
## 5     12      26         0                         0
hflights[hflights$Month==1 | hflights$DayofMonth==1, ] %>% head(5)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011     1          1         6    1400    1500            AA       428
## 5425 2011     1          2         7    1401    1501            AA       428
## 5426 2011     1          3         1    1352    1502            AA       428
## 5427 2011     1          4         2    1403    1513            AA       428
## 5428 2011     1          5         3    1405    1507            AA       428
##      TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424  N576AA                60      40      -10        0    IAH  DFW      224
## 5425  N557AA                60      45       -9        1    IAH  DFW      224
## 5426  N541AA                70      48       -8       -8    IAH  DFW      224
## 5427  N403AA                70      39        3        3    IAH  DFW      224
## 5428  N492AA                62      44       -3        5    IAH  DFW      224
##      TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424      7      13         0                         0
## 5425      6       9         0                         0
## 5426      5      17         0                         0
## 5427      9      22         0                         0
## 5428      9       9         0                         0

Selecting columns using indexing and dplyr methods:

hflights[, c("DepTime", "ArrTime", "FlightNum")] %>% head(5)
##      DepTime ArrTime FlightNum
## 5424    1400    1500       428
## 5425    1401    1501       428
## 5426    1352    1502       428
## 5427    1403    1513       428
## 5428    1405    1507       428
select(hflights, DepTime, ArrTime, FlightNum) %>% head(5)
##      DepTime ArrTime FlightNum
## 5424    1400    1500       428
## 5425    1401    1501       428
## 5426    1352    1502       428
## 5427    1403    1513       428
## 5428    1405    1507       428

Selecting columns and filtering rows using both methods:

hflights[ifelse(is.na(hflights$DepDelay), 0, hflights$DepDelay) > 60, c('UniqueCarrier', 'DepDelay')] %>% head(5)
##       UniqueCarrier DepDelay
## 5440             AA       90
## 5443             AA       67
## 8197             AA       74
## 19274            AA      125
## 19276            AA       82
filter(select(hflights, UniqueCarrier, DepDelay), DepDelay > 60) %>% head(5)
##   UniqueCarrier DepDelay
## 1            AA       90
## 2            AA       67
## 3            AA       74
## 4            AA      125
## 5            AA       82
select(hflights, Year:DayofMonth, contains("Taxi"), contains("Delay")) %>% head(5)
##      Year Month DayofMonth TaxiIn TaxiOut ArrDelay DepDelay
## 5424 2011     1          1      7      13      -10        0
## 5425 2011     1          2      6       9       -9        1
## 5426 2011     1          3      5      17       -8       -8
## 5427 2011     1          4      9      22        3        3
## 5428 2011     1          5      9       9       -3        5

Arranging by row values

#ascending order
hflights[order(hflights$DepDelay), c("UniqueCarrier", "DepDelay")] %>% head(5)
##         UniqueCarrier DepDelay
## 5996719            OO      -33
## 927973             MQ      -23
## 1694833            XE      -19
## 3814017            XE      -19
## 83407              CO      -18
hflights %>%
  select(UniqueCarrier, DepDelay) %>%
  arrange(DepDelay) %>% head(5)
##   UniqueCarrier DepDelay
## 1            OO      -33
## 2            MQ      -23
## 3            XE      -19
## 4            XE      -19
## 5            CO      -18
#descending order
hflights[order(hflights$DepDelay, decreasing = TRUE), c("UniqueCarrier", "DepDelay")] %>% head(5)
##         UniqueCarrier DepDelay
## 4086711            CO      981
## 5622757            AA      970
## 5457943            MQ      931
## 2843667            UA      869
## 3016449            MQ      814
hflights %>%
  select(UniqueCarrier, DepDelay) %>%
  arrange(desc(DepDelay)) %>% head(5)
##   UniqueCarrier DepDelay
## 1            CO      981
## 2            AA      970
## 3            MQ      931
## 4            UA      869
## 5            MQ      814

Adding column using two different methods:

hflights$Speed <- hflights$Distance / hflights$AirTime*60
hflights <- hflights %>%
  mutate(Speed2 = Distance/AirTime*60)
max(hflights$Speed-hflights$Speed2,na.rm=TRUE)
## [1] 0

Filter data based on multiple criteria using IN clause and select columns:

custom_data <- hflights %>%
  filter(UniqueCarrier %in% c('AA','AS','B6')) %>% 
  select(UniqueCarrier,AirTime,ArrDelay,DepDelay)
#custom_data[custom_data$UniqueCarrier %in% c('AA','AS','B6') & is.na(custom_data$ArrDelay),]

Aggregating data based on Unique Carrier:

custom_data %>% group_by(UniqueCarrier) %>% summarise(n_flights = n())
## # A tibble: 3 x 2
##   UniqueCarrier n_flights
##   <chr>             <int>
## 1 AA                 3244
## 2 AS                  365
## 3 B6                  695
custom_data %>% group_by(UniqueCarrier) %>% summarise(avg_arrDelay = mean(ArrDelay,na.rm = TRUE))
## # A tibble: 3 x 2
##   UniqueCarrier avg_arrDelay
##   <chr>                <dbl>
## 1 AA                   0.892
## 2 AS                   3.19 
## 3 B6                   9.86