Data manipulation with dplyr package

*There are few stage in data cleaning process as follow:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(hflights)

library(ggplot2)

df<-hflights

head(df)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0

Tibble is a special type of data frame. The benefit of using tbl_df method can be obvious as it provides the structure of each variable and not massey up on screen if one print the dataset out.

# Converting data frame to tibble 

df<-tbl_df(df)

# If one want to convert it back to the orginal data frame

df1<-as.data.frame(df)
# Checking 
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
glimpse(df) # alternative
## Observations: 227,496
## Variables: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
head(df)
## # A tibble: 6 x 21
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>
# replace carriers name to full name. Using the following way of replacing abbriviations to full name, but this one works only with character structure, not factor.
air_name <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental", 
         "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways", 
         "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier", 
         "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")

df$UniqueCarrier<-air_name[df$UniqueCarrier]

head(df)
## # A tibble: 6 x 21
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500      American       428
## 2  2011     1          2         7    1401    1501      American       428
## 3  2011     1          3         1    1352    1502      American       428
## 4  2011     1          4         2    1403    1513      American       428
## 5  2011     1          5         3    1405    1507      American       428
## 6  2011     1          6         4    1359    1503      American       428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>

select method

# Select the first five rows

df2<-df %>% select(1:5)

head(df2)
## # A tibble: 6 x 5
##    Year Month DayofMonth DayOfWeek DepTime
##   <int> <int>      <int>     <int>   <int>
## 1  2011     1          1         6    1400
## 2  2011     1          2         7    1401
## 3  2011     1          3         1    1352
## 4  2011     1          4         2    1403
## 5  2011     1          5         3    1405
## 6  2011     1          6         4    1359
# Select all columns, except columns from Year to ArrTime 

df3<-df %>% select(-c(Year:ArrTime))

head(df3)
## # A tibble: 6 x 15
##   UniqueCarrier FlightNum TailNum ActualElapsedTime AirTime ArrDelay
##           <chr>     <int>   <chr>             <int>   <int>    <int>
## 1      American       428  N576AA                60      40      -10
## 2      American       428  N557AA                60      45       -9
## 3      American       428  N541AA                70      48       -8
## 4      American       428  N403AA                70      39        3
## 5      American       428  N492AA                62      44       -3
## 6      American       428  N262AA                64      45       -7
## # ... with 9 more variables: DepDelay <int>, Origin <chr>, Dest <chr>,
## #   Distance <int>, TaxiIn <int>, TaxiOut <int>, Cancelled <int>,
## #   CancellationCode <chr>, Diverted <int>
# Select columns containing Arr

df4<-df %>% select(contains("Arr"),contains("N")) # similar applied to `matches`, `ends_with`, `starts_with`

head(df4)
## # A tibble: 6 x 12
##   ArrTime UniqueCarrier ArrDelay Month DayofMonth FlightNum TailNum Origin
##     <int>         <chr>    <int> <int>      <int>     <int>   <chr>  <chr>
## 1    1500      American      -10     1          1       428  N576AA    IAH
## 2    1501      American       -9     1          2       428  N557AA    IAH
## 3    1502      American       -8     1          3       428  N541AA    IAH
## 4    1513      American        3     1          4       428  N403AA    IAH
## 5    1507      American       -3     1          5       428  N492AA    IAH
## 6    1503      American       -7     1          6       428  N262AA    IAH
## # ... with 4 more variables: Distance <int>, TaxiIn <int>,
## #   Cancelled <int>, CancellationCode <chr>

There are many other functions like mutate, which creates a new variable from existing variables

filter function used to filter out rows

summarise function used to get summary statistics of certain variables in the data frame

# An example of filter function

df_fil<- df %>% filter(UniqueCarrier %in% c("American","Southwest"),!is.na(ArrTime)) # get only values containing `American` and `Southwest`

head(df_fil)
## # A tibble: 6 x 21
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500      American       428
## 2  2011     1          2         7    1401    1501      American       428
## 3  2011     1          3         1    1352    1502      American       428
## 4  2011     1          4         2    1403    1513      American       428
## 5  2011     1          5         3    1405    1507      American       428
## 6  2011     1          6         4    1359    1503      American       428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>
# Merged date month and year together
df_new<- df %>% mutate(Time=paste(DayofMonth,Month,Year, sep="-"))

head(df_new)
## # A tibble: 6 x 22
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500      American       428
## 2  2011     1          2         7    1401    1501      American       428
## 3  2011     1          3         1    1352    1502      American       428
## 4  2011     1          4         2    1403    1513      American       428
## 5  2011     1          5         3    1405    1507      American       428
## 6  2011     1          6         4    1359    1503      American       428
## # ... with 14 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>, Time <chr>