Load the dplyr package

library(dplyr)

Load the hflights package

# Install hflights only when it is missing, then attach it.
# The package name has to be a quoted string: install.packages(hflights)
# would try to evaluate an object named `hflights`, which does not exist
# when the package is not installed, so the install branch would error.
if (!requireNamespace("hflights", quietly = TRUE)) {
  install.packages("hflights")
}
library(hflights)

View the first few rows of the hflights data

# Install knitr only when it is missing. The package name must be quoted:
# install.packages(knitr) would look up an object named `knitr` and fail.
if (!requireNamespace("knitr", quietly = TRUE)) {
  install.packages("knitr")
}
# Attach knitr, as the original require() call did when it was installed.
library(knitr)
# Render the first rows of hflights as a formatted table.
knitr::kable(head(hflights))
Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
5424 2011 1 1 6 1400 1500 AA 428 N576AA 60 40 -10 0 IAH DFW 224 7 13 0 0
5425 2011 1 2 7 1401 1501 AA 428 N557AA 60 45 -9 1 IAH DFW 224 6 9 0 0
5426 2011 1 3 1 1352 1502 AA 428 N541AA 70 48 -8 -8 IAH DFW 224 5 17 0 0
5427 2011 1 4 2 1403 1513 AA 428 N403AA 70 39 3 3 IAH DFW 224 9 22 0 0
5428 2011 1 5 3 1405 1507 AA 428 N492AA 62 44 -3 5 IAH DFW 224 9 9 0 0
5429 2011 1 6 4 1359 1503 AA 428 N262AA 64 45 -7 -1 IAH DFW 224 6 13 0 0

Summarise the hflights data

summary(hflights)
##       Year          Month          DayofMonth      DayOfWeek    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000  
##  Median :2011   Median : 7.000   Median :16.00   Median :4.000  
##  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000  
##                                                                 
##     DepTime        ArrTime     UniqueCarrier        FlightNum   
##  Min.   :   1   Min.   :   1   Length:227496      Min.   :   1  
##  1st Qu.:1021   1st Qu.:1215   Class :character   1st Qu.: 855  
##  Median :1416   Median :1617   Mode  :character   Median :1696  
##  Mean   :1396   Mean   :1578                      Mean   :1962  
##  3rd Qu.:1801   3rd Qu.:1953                      3rd Qu.:2755  
##  Max.   :2400   Max.   :2400                      Max.   :7290  
##  NA's   :2905   NA's   :3066                                    
##    TailNum          ActualElapsedTime    AirTime         ArrDelay      
##  Length:227496      Min.   : 34.0     Min.   : 11.0   Min.   :-70.000  
##  Class :character   1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000  
##  Mode  :character   Median :128.0     Median :107.0   Median :  0.000  
##                     Mean   :129.3     Mean   :108.1   Mean   :  7.094  
##                     3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000  
##                     Max.   :575.0     Max.   :549.0   Max.   :978.000  
##                     NA's   :3622      NA's   :3622    NA's   :3622     
##     DepDelay          Origin              Dest              Distance     
##  Min.   :-33.000   Length:227496      Length:227496      Min.   :  79.0  
##  1st Qu.: -3.000   Class :character   Class :character   1st Qu.: 376.0  
##  Median :  0.000   Mode  :character   Mode  :character   Median : 809.0  
##  Mean   :  9.445                                         Mean   : 787.8  
##  3rd Qu.:  9.000                                         3rd Qu.:1042.0  
##  Max.   :981.000                                         Max.   :3904.0  
##  NA's   :2905                                                            
##      TaxiIn           TaxiOut         Cancelled       CancellationCode  
##  Min.   :  1.000   Min.   :  1.00   Min.   :0.00000   Length:227496     
##  1st Qu.:  4.000   1st Qu.: 10.00   1st Qu.:0.00000   Class :character  
##  Median :  5.000   Median : 14.00   Median :0.00000   Mode  :character  
##  Mean   :  6.099   Mean   : 15.09   Mean   :0.01307                     
##  3rd Qu.:  7.000   3rd Qu.: 18.00   3rd Qu.:0.00000                     
##  Max.   :165.000   Max.   :163.00   Max.   :1.00000                     
##  NA's   :3066      NA's   :2947                                         
##     Diverted       
##  Min.   :0.000000  
##  1st Qu.:0.000000  
##  Median :0.000000  
##  Mean   :0.002853  
##  3rd Qu.:0.000000  
##  Max.   :1.000000  
## 

View the hflights data using the dplyr function ‘tbl_df’

# Wrap the first rows of hflights in a tbl so they print in dplyr's compact
# format (columns that do not fit are listed under "Variables not shown").
# NOTE(review): tbl_df() is reportedly deprecated in newer dplyr releases in
# favour of as_tibble() — confirm against the installed dplyr version.
tbl_df(head(hflights))
## Source: local data frame [6 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## Variables not shown: TailNum <chr>, ActualElapsedTime <int>, AirTime
##   <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>,
##   Distance <int>, TaxiIn <int>, TaxiOut <int>, Cancelled <int>,
##   CancellationCode <chr>, Diverted <int>.
  It automatically adjusts which variables are shown according to the window size.

Summarise the hflights data using the dplyr function ‘glimpse’

glimpse(head(hflights))
## Observations: 6
## Variables: 21
## $ Year              (int) 2011, 2011, 2011, 2011, 2011, 2011
## $ Month             (int) 1, 1, 1, 1, 1, 1
## $ DayofMonth        (int) 1, 2, 3, 4, 5, 6
## $ DayOfWeek         (int) 6, 7, 1, 2, 3, 4
## $ DepTime           (int) 1400, 1401, 1352, 1403, 1405, 1359
## $ ArrTime           (int) 1500, 1501, 1502, 1513, 1507, 1503
## $ UniqueCarrier     (chr) "AA", "AA", "AA", "AA", "AA", "AA"
## $ FlightNum         (int) 428, 428, 428, 428, 428, 428
## $ TailNum           (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64
## $ AirTime           (int) 40, 45, 48, 39, 44, 45
## $ ArrDelay          (int) -10, -9, -8, 3, -3, -7
## $ DepDelay          (int) 0, 1, -8, 3, 5, -1
## $ Origin            (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH"
## $ Dest              (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW"
## $ Distance          (int) 224, 224, 224, 224, 224, 224
## $ TaxiIn            (int) 7, 6, 5, 9, 9, 6
## $ TaxiOut           (int) 13, 9, 17, 22, 9, 13
## $ Cancelled         (int) 0, 0, 0, 0, 0, 0
## $ CancellationCode  (chr) "", "", "", "", "", ""
## $ Diverted          (int) 0, 0, 0, 0, 0, 0

Convert the hflights data.frame into a hflights tbl

# Replace the plain data.frame with a tbl version of itself; every later
# step in this file works on this tbl.
# NOTE(review): tbl_df() is reportedly deprecated in newer dplyr releases in
# favour of as_tibble() — confirm against the installed dplyr version.
hflights <- tbl_df(hflights)

Display the first rows of the newly created ‘hflights’ tbl

head(hflights)
## Source: local data frame [6 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## Variables not shown: TailNum <chr>, ActualElapsedTime <int>, AirTime
##   <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>,
##   Distance <int>, TaxiIn <int>, TaxiOut <int>, Cancelled <int>,
##   CancellationCode <chr>, Diverted <int>.

Create the object carriers

# Pull the carrier-code column out of hflights as a plain character vector.
carriers <- hflights[["UniqueCarrier"]]

Summarize the object carriers

Head of carriers

head(carriers)
## [1] "AA" "AA" "AA" "AA" "AA" "AA"

Summary of carriers

summary(carriers)
##    Length     Class      Mode 
##    227496 character character

Structure of carriers

str(carriers)
##  chr [1:227496] "AA" "AA" "AA" "AA" "AA" "AA" "AA" ...

Check the class of the new hflights tbl

class(hflights)
## [1] "tbl_df"     "tbl"        "data.frame"

Example of Look Up Table

You can create a lookup table with a named vector.

When you subset the lookup table with a character string (like the character strings in UniqueCarrier), R will return the values of the lookup table that correspond to the names in the character string.

To see how this works:

Example 1

# Lookup table: a named character vector whose names are the carrier codes
# and whose values are the readable labels.
lut <- c(AA = "American", AS = "Alaska", B6 = "JetBlue")

# Subsetting by a code returns the matching label, still named by its code.
one <- "AA"
one <- lut[one]

# Show the result.
one
##         AA 
## "American"

Example 2

# Two codes to translate in one go.
two <- c("AA", "AS")

# Same lookup table as before, built from parallel label/code vectors.
lut <- setNames(
  c("American", "Alaska", "JetBlue"),
  c("AA", "AS", "B6")
)

# Subsetting by a character vector returns one label per code, in order.
two <- lut[two]

# Show the result.
two
##         AA         AS 
## "American"   "Alaska"

Example 3

# Build the lookup table in two steps: the labels first, then their codes
# attached as names.
lut <- c("American", "Alaska", "JetBlue")
names(lut) <- c("AA", "AS", "B6")

# Translate all three codes at once.
three <- c("AA", "AS", "B6")
three <- lut[three]

# Show the result.
three
##         AA         AS         B6 
## "American"   "Alaska"  "JetBlue"

Changing labels of hflights

Both the dplyr and hflights packages are loaded into workspace

# Carrier lookup table: two-character carrier code -> readable airline name.
lut <- c(
  AA = "American",
  AS = "Alaska",
  B6 = "JetBlue",
  CO = "Continental",
  DL = "Delta",
  OO = "SkyWest",
  UA = "United",
  US = "US_Airways",
  WN = "Southwest",
  EV = "Atlantic_Southeast",
  F9 = "Frontier",
  FL = "AirTran",
  MQ = "American_Eagle",
  XE = "ExpressJet",
  YV = "Mesa"
)

Use lut to translate the UniqueCarrier column of hflights, storing the result in a new column, UniqueCarriers

# Look every carrier code up in lut and store the airline names in a new
# column, UniqueCarriers; the original UniqueCarrier codes are left in place.
hflights[["UniqueCarriers"]] <- lut[hflights[["UniqueCarrier"]]]

Inspect the resulting raw values of your variables

glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year              (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ UniqueCarriers    (chr) "American", "American", "American", "America...

Similarly, this time we change the labels in the CancellationCode column.

This column lists reasons why a flight was cancelled using a non-informative alphabetical code.

Changing labels of hflights - 2

The lookup table

# Cancellation lookup table: one-letter cancellation code -> readable reason.
# "E" is an extra label meant for flights that were not cancelled.
# Code "C" is labelled "FAA" (Federal Aviation Administration); the earlier
# spelling "FFA" was a typo.
lut <- c(
  "A" = "carrier",
  "B" = "weather",
  "C" = "FAA",
  "D" = "security",
  "E" = "not cancelled"
)

Recode the CancellationCode column with lut

# Flights that were not cancelled carry an empty code (""), which has no
# entry in lut and would therefore be recoded to NA. Map "" to "E" first so
# those rows pick up the "not cancelled" label.
hflights$CancellationCode[hflights$CancellationCode == ""] <- "E"
hflights$CancellationCode <- lut[hflights$CancellationCode]

Glimpse at hflights

glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year              (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  (chr) "not cancelled", "not cancelled", "not cance...
## $ Diverted          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ UniqueCarriers    (chr) "American", "American", "American", "America...