library(dplyr)if (!require(hflights)) install.packages(hflights)
library(hflights)if (!require(knitr)) install.packages(knitr)
knitr::kable(head(hflights))| Year | Month | DayofMonth | DayOfWeek | DepTime | ArrTime | UniqueCarrier | FlightNum | TailNum | ActualElapsedTime | AirTime | ArrDelay | DepDelay | Origin | Dest | Distance | TaxiIn | TaxiOut | Cancelled | CancellationCode | Diverted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5424 | 2011 | 1 | 1 | 6 | 1400 | 1500 | AA | 428 | N576AA | 60 | 40 | -10 | 0 | IAH | DFW | 224 | 7 | 13 | 0 | 0 | |
| 5425 | 2011 | 1 | 2 | 7 | 1401 | 1501 | AA | 428 | N557AA | 60 | 45 | -9 | 1 | IAH | DFW | 224 | 6 | 9 | 0 | 0 | |
| 5426 | 2011 | 1 | 3 | 1 | 1352 | 1502 | AA | 428 | N541AA | 70 | 48 | -8 | -8 | IAH | DFW | 224 | 5 | 17 | 0 | 0 | |
| 5427 | 2011 | 1 | 4 | 2 | 1403 | 1513 | AA | 428 | N403AA | 70 | 39 | 3 | 3 | IAH | DFW | 224 | 9 | 22 | 0 | 0 | |
| 5428 | 2011 | 1 | 5 | 3 | 1405 | 1507 | AA | 428 | N492AA | 62 | 44 | -3 | 5 | IAH | DFW | 224 | 9 | 9 | 0 | 0 | |
| 5429 | 2011 | 1 | 6 | 4 | 1359 | 1503 | AA | 428 | N262AA | 64 | 45 | -7 | -1 | IAH | DFW | 224 | 6 | 13 | 0 | 0 |
summary(hflights)## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
tbl_df(head(hflights))## Source: local data frame [6 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## Variables not shown: TailNum <chr>, ActualElapsedTime <int>, AirTime
## <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>,
## Distance <int>, TaxiIn <int>, TaxiOut <int>, Cancelled <int>,
## CancellationCode <chr>, Diverted <int>.
it automatically adjust the variables to show according to window size
glimpse(head(hflights))## Observations: 6
## Variables: 21
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011
## $ Month (int) 1, 1, 1, 1, 1, 1
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA"
## $ FlightNum (int) 428, 428, 428, 428, 428, 428
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64
## $ AirTime (int) 40, 45, 48, 39, 44, 45
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7
## $ DepDelay (int) 0, 1, -8, 3, 5, -1
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH"
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW"
## $ Distance (int) 224, 224, 224, 224, 224, 224
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13
## $ Cancelled (int) 0, 0, 0, 0, 0, 0
## $ CancellationCode (chr) "", "", "", "", "", ""
## $ Diverted (int) 0, 0, 0, 0, 0, 0
hflights <- tbl_df(hflights)head(hflights)## Source: local data frame [6 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## Variables not shown: TailNum <chr>, ActualElapsedTime <int>, AirTime
## <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>,
## Distance <int>, TaxiIn <int>, TaxiOut <int>, Cancelled <int>,
## CancellationCode <chr>, Diverted <int>.
carriers <- hflights$UniqueCarrierhead(carriers)## [1] "AA" "AA" "AA" "AA" "AA" "AA"
summary(carriers)## Length Class Mode
## 227496 character character
str(carriers)## chr [1:227496] "AA" "AA" "AA" "AA" "AA" "AA" "AA" ...
class(hflights)## [1] "tbl_df" "tbl" "data.frame"
You can create a lookup table with a named vector.
When you subset the lookup table with a character string (like the character strings in UniqueCarrier), R will return the values of the lookup table that correspond to the names in the character string.
To see how this works:
one <- "AA"
lut <- c("AA" = "American",
"AS" = "Alaska",
"B6" = "JetBlue")
one <- lut[one]
one## AA
## "American"
two <- c("AA", "AS")
lut <- c("AA" = "American",
"AS" = "Alaska",
"B6" = "JetBlue")
two <- lut[two]
two## AA AS
## "American" "Alaska"
three <- c("AA", "AS", "B6")
lut <- c("AA" = "American",
"AS" = "Alaska",
"B6" = "JetBlue")
three <- lut[three]
three## AA AS B6
## "American" "Alaska" "JetBlue"
lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
"DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
"WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
"FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")hflights$UniqueCarriers <- lut[hflights$UniqueCarrier]glimpse(hflights)## Observations: 227,496
## Variables: 22
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ UniqueCarriers (chr) "American", "American", "American", "America...
Similarly, this time to change the labels in the CancellationCode column.
This column lists reasons why a flight was cancelled using a non-informative alphabetical code.
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA", "D" = "security", "E" = "not cancelled")hflights$CancellationCode <- lut[hflights$CancellationCode]glimpse(hflights)## Observations: 227,496
## Variables: 22
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Diverted (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ UniqueCarriers (chr) "American", "American", "American", "America...