Dplyr Package

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.2.5
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.2.5
## Warning: package 'tibble' was built under R version 3.2.5
## Warning: package 'tidyr' was built under R version 3.2.5
## Warning: package 'readr' was built under R version 3.2.5
## Warning: package 'purrr' was built under R version 3.2.5
## Warning: package 'dplyr' was built under R version 3.2.5
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats

*Dataset

# install hflights package as install.packages("hflights")

library(hflights)
## Warning: package 'hflights' was built under R version 3.2.5
# Check few first rows of dataset 

head(hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0
# This dataset is in tuple format, convert to data frame : as.data.frame(hflights)
#Check number of rows and columns

dim(hflights)
## [1] 227496     21
#Check variables names

names(hflights)
##  [1] "Year"              "Month"             "DayofMonth"       
##  [4] "DayOfWeek"         "DepTime"           "ArrTime"          
##  [7] "UniqueCarrier"     "FlightNum"         "TailNum"          
## [10] "ActualElapsedTime" "AirTime"           "ArrDelay"         
## [13] "DepDelay"          "Origin"            "Dest"             
## [16] "Distance"          "TaxiIn"            "TaxiOut"          
## [19] "Cancelled"         "CancellationCode"  "Diverted"
# Check the structure of the dataset

glimpse(hflights)
## Observations: 227,496
## Variables: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# or

str(hflights)
## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
# rename confusing code "UniqueCarrier" column

lut<-c("AA"="American","AS"="Alaska","B6"="JetBlue","CO"="Continental","DL"="Delta","OO"="SkyWest","AU"="United","US"="US_Airways","WN"="Southwest","EV"="Atlantic_Southeast","F9"="Frontier","FL"="AirTran","MQ"="American_Eagle","XE"="ExpressJet","YV"="Mesa")

# Create a new column

hflights$New_Carrier<-lut[hflights$UniqueCarrier]
# Encode cancellationCode

unique(hflights$CancellationCode)
## [1] ""  "A" "B" "C" "D"
head(hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0
##      New_Carrier
## 5424    American
## 5425    American
## 5426    American
## 5427    American
## 5428    American
## 5429    American
# Changing labels of CancellationCode

hflights$CancellationCode<-as.factor(hflights$CancellationCode)

my_flight<-hflights

# Create new column with new name labels

levels(my_flight$CancellationCode)<-c(NA,"Carrier","Weather","National Air System","Security")

tail(my_flight)
##         Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 6083254 2011    12          6         2    1307    1600            WN
## 6083255 2011    12          6         2    1818    2111            WN
## 6083256 2011    12          6         2    2047    2334            WN
## 6083257 2011    12          6         2     912    1031            WN
## 6083258 2011    12          6         2     656     812            WN
## 6083259 2011    12          6         2    1600    1713            WN
##         FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay
## 6083254       471  N632SW               113      98        0        7
## 6083255      1191  N284WN               113      97       -9        8
## 6083256      1674  N366SW               107      94        4        7
## 6083257       127  N777QC                79      61       -4       -3
## 6083258       621  N727SW                76      64      -13       -4
## 6083259      1597  N745SW                73      59      -12        0
##         Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode
## 6083254    HOU  TPA      781      5      10         0             <NA>
## 6083255    HOU  TPA      781      5      11         0             <NA>
## 6083256    HOU  TPA      781      4       9         0             <NA>
## 6083257    HOU  TUL      453      4      14         0             <NA>
## 6083258    HOU  TUL      453      3       9         0             <NA>
## 6083259    HOU  TUL      453      3      11         0             <NA>
##         Diverted New_Carrier
## 6083254        0   Southwest
## 6083255        0   Southwest
## 6083256        0   Southwest
## 6083257        0   Southwest
## 6083258        0   Southwest
## 6083259        0   Southwest

*Select

# Select only four variables below

my_new_variable<-hflights%>%dplyr::select(UniqueCarrier,AirTime,ArrTime,DepDelay)

head(my_new_variable)
##      UniqueCarrier AirTime ArrTime DepDelay
## 5424            AA      40    1500        0
## 5425            AA      45    1501        1
## 5426            AA      48    1502       -8
## 5427            AA      39    1513        3
## 5428            AA      44    1507        5
## 5429            AA      45    1503       -1
# Select all variables, except variable 5 to9

my_variable1<-hflights %>% dplyr::select(-c(5:9))

head(my_variable1)
##      Year Month DayofMonth DayOfWeek ActualElapsedTime AirTime ArrDelay
## 5424 2011     1          1         6                60      40      -10
## 5425 2011     1          2         7                60      45       -9
## 5426 2011     1          3         1                70      48       -8
## 5427 2011     1          4         2                70      39        3
## 5428 2011     1          5         3                62      44       -3
## 5429 2011     1          6         4                64      45       -7
##      DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled
## 5424        0    IAH  DFW      224      7      13         0
## 5425        1    IAH  DFW      224      6       9         0
## 5426       -8    IAH  DFW      224      5      17         0
## 5427        3    IAH  DFW      224      9      22         0
## 5428        5    IAH  DFW      224      9       9         0
## 5429       -1    IAH  DFW      224      6      13         0
##      CancellationCode Diverted New_Carrier
## 5424                         0    American
## 5425                         0    American
## 5426                         0    American
## 5427                         0    American
## 5428                         0    American
## 5429                         0    American
# Select the first four variables

my_first_fourvariable<-hflights %>% dplyr::select(Year:DayOfWeek)

head(my_first_fourvariable)
##      Year Month DayofMonth DayOfWeek
## 5424 2011     1          1         6
## 5425 2011     1          2         7
## 5426 2011     1          3         1
## 5427 2011     1          4         2
## 5428 2011     1          5         3
## 5429 2011     1          6         4
# select all columns end with "Time"

my_time_variable<-hflights %>% dplyr::select(ends_with("Time"))

head(my_time_variable)
##      DepTime ArrTime ActualElapsedTime AirTime
## 5424    1400    1500                60      40
## 5425    1401    1501                60      45
## 5426    1352    1502                70      48
## 5427    1403    1513                70      39
## 5428    1405    1507                62      44
## 5429    1359    1503                64      45
# Can be 'starts_with("T")', 'contains("Time")'

my_combine<-hflights %>% dplyr::select(contains("Taxi"),Distance)
       

head(my_combine)
##      TaxiIn TaxiOut Distance
## 5424      7      13      224
## 5425      6       9      224
## 5426      5      17      224
## 5427      9      22      224
## 5428      9       9      224
## 5429      6      13      224

*Mutate

# Create a avarage speed 

my_speed<-hflights %>% dplyr::mutate(mean_speed=(Distance/AirTime)*60)

# can be creating many new variables at the same time
head(my_speed)
##   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011     1          1         6    1400    1500            AA       428
## 2 2011     1          2         7    1401    1501            AA       428
## 3 2011     1          3         1    1352    1502            AA       428
## 4 2011     1          4         2    1403    1513            AA       428
## 5 2011     1          5         3    1405    1507            AA       428
## 6 2011     1          6         4    1359    1503            AA       428
##   TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1  N576AA                60      40      -10        0    IAH  DFW      224
## 2  N557AA                60      45       -9        1    IAH  DFW      224
## 3  N541AA                70      48       -8       -8    IAH  DFW      224
## 4  N403AA                70      39        3        3    IAH  DFW      224
## 5  N492AA                62      44       -3        5    IAH  DFW      224
## 6  N262AA                64      45       -7       -1    IAH  DFW      224
##   TaxiIn TaxiOut Cancelled CancellationCode Diverted New_Carrier
## 1      7      13         0                         0    American
## 2      6       9         0                         0    American
## 3      5      17         0                         0    American
## 4      9      22         0                         0    American
## 5      9       9         0                         0    American
## 6      6      13         0                         0    American
##   mean_speed
## 1   336.0000
## 2   298.6667
## 3   280.0000
## 4   344.6154
## 5   305.4545
## 6   298.6667

*To be continued

Filter

# find all flights operated by American, Jetblue 

new_flights<- hflights %>% filter(New_Carrier %in% c("American","Delta","JetBlue"))

tail(new_flights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 6575 2011    12         30         5    1554    1849            DL
## 6576 2011    12         30         5     557     856            DL
## 6577 2011    12         31         6    1528    1812            DL
## 6578 2011    12         31         6     709    1005            DL
## 6579 2011    12         31         6    1423    1719            DL
## 6580 2011    12         31         6     556     853            DL
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 6575      1846  N909DL               115      92      -10       -5    HOU
## 6576      2010  N968DL               119      96       -9       -3    IAH
## 6577         8  N376NW               104      90      -21       -4    IAH
## 6578       810  N337NW               116      94       -5        2    IAH
## 6579      1590  N914DL               116      92      278      283    IAH
## 6580      2010  N987DL               117      93      -12       -4    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 6575  ATL      696      9      14         0                         0
## 6576  ATL      689     10      13         0                         0
## 6577  ATL      689      4      10         0                         0
## 6578  ATL      689     11      11         0                         0
## 6579  ATL      689      7      17         0                         0
## 6580  ATL      689      8      16         0                         0
##      New_Carrier
## 6575       Delta
## 6576       Delta
## 6577       Delta
## 6578       Delta
## 6579       Delta
## 6580       Delta
# Create a date column from Year,Month and date columns

new_date<- hflights %>% dplyr::mutate(Date=paste(DayofMonth,Month,Year,sep = "-")) %>% dplyr::select(20:23)

head(new_date)
##   CancellationCode Diverted New_Carrier     Date
## 1                         0    American 1-1-2011
## 2                         0    American 2-1-2011
## 3                         0    American 3-1-2011
## 4                         0    American 4-1-2011
## 5                         0    American 5-1-2011
## 6                         0    American 6-1-2011
# How many airplanes only flew to one destination

only_dest<- hflights %>% group_by(TailNum) %>% summarise(ndest=n_distinct(Dest)) %>% filter(ndest==1) %>% summarise(n_airplane=n())

dim(only_dest)
## [1] 1 1
# Find the most visited destinations for each carrier

most_dest<- hflights%>% group_by(UniqueCarrier,Dest) %>% summarise(n=n()) %>% mutate(rank=rank(desc(n))) %>% filter(rank==1)

dim(most_dest)
## [1] 15  4
head(most_dest)
## Source: local data frame [6 x 4]
## Groups: UniqueCarrier [6]
## 
##   UniqueCarrier  Dest     n  rank
##           <chr> <chr> <int> <dbl>
## 1            AA   DFW  2105     1
## 2            AS   SEA   365     1
## 3            B6   JFK   695     1
## 4            CO   EWR  3924     1
## 5            DL   ATL  2396     1
## 6            EV   DTW   851     1