Import necessary packages and libraries
if(!require(hflights)) install.packages("hflights")
## Loading required package: hflights
suppressMessages(library(hflights))
suppressMessages(library(dplyr))
## Warning: package 'dplyr' was built under R version 4.1.2
Exploring and importing data
data(package="hflights")
data(hflights)
str(hflights)
## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011 1 1 6 1400 1500 AA 428
## 5425 2011 1 2 7 1401 1501 AA 428
## 5426 2011 1 3 1 1352 1502 AA 428
## 5427 2011 1 4 2 1403 1513 AA 428
## 5428 2011 1 5 3 1405 1507 AA 428
## 5429 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424 N576AA 60 40 -10 0 IAH DFW 224
## 5425 N557AA 60 45 -9 1 IAH DFW 224
## 5426 N541AA 70 48 -8 -8 IAH DFW 224
## 5427 N403AA 70 39 3 3 IAH DFW 224
## 5428 N492AA 62 44 -3 5 IAH DFW 224
## 5429 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 7 13 0 0
## 5425 6 9 0 0
## 5426 5 17 0 0
## 5427 9 22 0 0
## 5428 9 9 0 0
## 5429 6 13 0 0
Filtering for rows using both indexing and dplyr function methods:
hflights[hflights$Month==1 & hflights$DayofMonth==1, ] %>% head(5)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011 1 1 6 1400 1500 AA 428
## 6343 2011 1 1 6 728 840 AA 460
## 19266 2011 1 1 6 1631 1736 AA 1121
## 23655 2011 1 1 6 1756 2112 AA 1294
## 33051 2011 1 1 6 1012 1347 AA 1700
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424 N576AA 60 40 -10 0 IAH DFW 224
## 6343 N520AA 72 41 5 8 IAH DFW 224
## 19266 N4WVAA 65 37 -9 1 IAH DFW 224
## 23655 N3DGAA 136 113 -3 1 IAH MIA 964
## 33051 N3DAAA 155 117 7 -8 IAH MIA 964
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 7 13 0 0
## 6343 6 25 0 0
## 19266 16 12 0 0
## 23655 9 14 0 0
## 33051 12 26 0 0
filter(hflights, Month==1, DayofMonth==1) %>% head(5)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N576AA 60 40 -10 0 IAH DFW 224
## 2 N520AA 72 41 5 8 IAH DFW 224
## 3 N4WVAA 65 37 -9 1 IAH DFW 224
## 4 N3DGAA 136 113 -3 1 IAH MIA 964
## 5 N3DAAA 155 117 7 -8 IAH MIA 964
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 7 13 0 0
## 2 6 25 0 0
## 3 16 12 0 0
## 4 9 14 0 0
## 5 12 26 0 0
hflights[hflights$Month==1 | hflights$DayofMonth==1, ] %>% head(5)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011 1 1 6 1400 1500 AA 428
## 5425 2011 1 2 7 1401 1501 AA 428
## 5426 2011 1 3 1 1352 1502 AA 428
## 5427 2011 1 4 2 1403 1513 AA 428
## 5428 2011 1 5 3 1405 1507 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424 N576AA 60 40 -10 0 IAH DFW 224
## 5425 N557AA 60 45 -9 1 IAH DFW 224
## 5426 N541AA 70 48 -8 -8 IAH DFW 224
## 5427 N403AA 70 39 3 3 IAH DFW 224
## 5428 N492AA 62 44 -3 5 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 7 13 0 0
## 5425 6 9 0 0
## 5426 5 17 0 0
## 5427 9 22 0 0
## 5428 9 9 0 0
Selecting columns using indexing and dplyr methods:
hflights[, c("DepTime", "ArrTime", "FlightNum")] %>% head(5)
## DepTime ArrTime FlightNum
## 5424 1400 1500 428
## 5425 1401 1501 428
## 5426 1352 1502 428
## 5427 1403 1513 428
## 5428 1405 1507 428
select(hflights, DepTime, ArrTime, FlightNum) %>% head(5)
## DepTime ArrTime FlightNum
## 5424 1400 1500 428
## 5425 1401 1501 428
## 5426 1352 1502 428
## 5427 1403 1513 428
## 5428 1405 1507 428
Selecting columns and filtering rows using both methods:
hflights[ifelse(is.na(hflights$DepDelay), 0, hflights$DepDelay) > 60, c('UniqueCarrier', 'DepDelay')] %>% head(5)
## UniqueCarrier DepDelay
## 5440 AA 90
## 5443 AA 67
## 8197 AA 74
## 19274 AA 125
## 19276 AA 82
filter(select(hflights, UniqueCarrier, DepDelay), DepDelay > 60) %>% head(5)
## UniqueCarrier DepDelay
## 1 AA 90
## 2 AA 67
## 3 AA 74
## 4 AA 125
## 5 AA 82
select(hflights, Year:DayofMonth, contains("Taxi"), contains("Delay")) %>% head(5)
## Year Month DayofMonth TaxiIn TaxiOut ArrDelay DepDelay
## 5424 2011 1 1 7 13 -10 0
## 5425 2011 1 2 6 9 -9 1
## 5426 2011 1 3 5 17 -8 -8
## 5427 2011 1 4 9 22 3 3
## 5428 2011 1 5 9 9 -3 5
Arranging by row values
#ascending order
hflights[order(hflights$DepDelay), c("UniqueCarrier", "DepDelay")] %>% head(5)
## UniqueCarrier DepDelay
## 5996719 OO -33
## 927973 MQ -23
## 1694833 XE -19
## 3814017 XE -19
## 83407 CO -18
hflights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(DepDelay) %>% head(5)
## UniqueCarrier DepDelay
## 1 OO -33
## 2 MQ -23
## 3 XE -19
## 4 XE -19
## 5 CO -18
#descending order
hflights[order(hflights$DepDelay, decreasing = TRUE), c("UniqueCarrier", "DepDelay")] %>% head(5)
## UniqueCarrier DepDelay
## 4086711 CO 981
## 5622757 AA 970
## 5457943 MQ 931
## 2843667 UA 869
## 3016449 MQ 814
hflights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay)) %>% head(5)
## UniqueCarrier DepDelay
## 1 CO 981
## 2 AA 970
## 3 MQ 931
## 4 UA 869
## 5 MQ 814
Adding column using two different methods:
hflights$Speed <- hflights$Distance / hflights$AirTime*60
hflights <- hflights %>%
mutate(Speed2 = Distance/AirTime*60)
max(hflights$Speed-hflights$Speed2,na.rm=TRUE)
## [1] 0
Filter data based on multiple criteria using IN clause and select columns:
custom_data <- hflights %>%
filter(UniqueCarrier %in% c('AA','AS','B6')) %>%
select(UniqueCarrier,AirTime,ArrDelay,DepDelay)
#custom_data[custom_data$UniqueCarrier %in% c('AA','AS','B6') & is.na(custom_data$ArrDelay),]
Aggregating data based on Unique Carrier:
custom_data %>% group_by(UniqueCarrier) %>% summarise(n_flights = n())
## # A tibble: 3 x 2
## UniqueCarrier n_flights
## <chr> <int>
## 1 AA 3244
## 2 AS 365
## 3 B6 695
custom_data %>% group_by(UniqueCarrier) %>% summarise(avg_arrDelay = mean(ArrDelay,na.rm = TRUE))
## # A tibble: 3 x 2
## UniqueCarrier avg_arrDelay
## <chr> <dbl>
## 1 AA 0.892
## 2 AS 3.19
## 3 B6 9.86