Data manipulation with dplyr package
*There are few stage in data cleaning process as follow:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
library(ggplot2)
df<-hflights
head(df)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
Tibble is a special type of data frame. The benefit of using tbl_df method can be obvious as it provides the structure of each variable and not massey up on screen if one print the dataset out.
# Converting data frame to tibble
df<-tbl_df(df)
# If one want to convert it back to the orginal data frame
df1<-as.data.frame(df)
# Checking
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
glimpse(df) # alternative
## Observations: 227,496
## Variables: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
head(df)
## # A tibble: 6 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>
# replace carriers name to full name. Using the following way of replacing abbriviations to full name, but this one works only with character structure, not factor.
air_name <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
"DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
"WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
"FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")
df$UniqueCarrier<-air_name[df$UniqueCarrier]
head(df)
## # A tibble: 6 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 American 428
## 2 2011 1 2 7 1401 1501 American 428
## 3 2011 1 3 1 1352 1502 American 428
## 4 2011 1 4 2 1403 1513 American 428
## 5 2011 1 5 3 1405 1507 American 428
## 6 2011 1 6 4 1359 1503 American 428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>
select method
# Select the first five rows
df2<-df %>% select(1:5)
head(df2)
## # A tibble: 6 x 5
## Year Month DayofMonth DayOfWeek DepTime
## <int> <int> <int> <int> <int>
## 1 2011 1 1 6 1400
## 2 2011 1 2 7 1401
## 3 2011 1 3 1 1352
## 4 2011 1 4 2 1403
## 5 2011 1 5 3 1405
## 6 2011 1 6 4 1359
# Select all columns, except columns from Year to ArrTime
df3<-df %>% select(-c(Year:ArrTime))
head(df3)
## # A tibble: 6 x 15
## UniqueCarrier FlightNum TailNum ActualElapsedTime AirTime ArrDelay
## <chr> <int> <chr> <int> <int> <int>
## 1 American 428 N576AA 60 40 -10
## 2 American 428 N557AA 60 45 -9
## 3 American 428 N541AA 70 48 -8
## 4 American 428 N403AA 70 39 3
## 5 American 428 N492AA 62 44 -3
## 6 American 428 N262AA 64 45 -7
## # ... with 9 more variables: DepDelay <int>, Origin <chr>, Dest <chr>,
## # Distance <int>, TaxiIn <int>, TaxiOut <int>, Cancelled <int>,
## # CancellationCode <chr>, Diverted <int>
# Select columns containing Arr
df4<-df %>% select(contains("Arr"),contains("N")) # similar applied to `matches`, `ends_with`, `starts_with`
head(df4)
## # A tibble: 6 x 12
## ArrTime UniqueCarrier ArrDelay Month DayofMonth FlightNum TailNum Origin
## <int> <chr> <int> <int> <int> <int> <chr> <chr>
## 1 1500 American -10 1 1 428 N576AA IAH
## 2 1501 American -9 1 2 428 N557AA IAH
## 3 1502 American -8 1 3 428 N541AA IAH
## 4 1513 American 3 1 4 428 N403AA IAH
## 5 1507 American -3 1 5 428 N492AA IAH
## 6 1503 American -7 1 6 428 N262AA IAH
## # ... with 4 more variables: Distance <int>, TaxiIn <int>,
## # Cancelled <int>, CancellationCode <chr>
There are many other functions like mutate, which creates a new variable from existing variables
filter function used to filter out rows
summarise function used to get summary statistics of certain variables in the data frame
# An example of filter function
df_fil<- df %>% filter(UniqueCarrier %in% c("American","Southwest"),!is.na(ArrTime)) # get only values containing `American` and `Southwest`
head(df_fil)
## # A tibble: 6 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 American 428
## 2 2011 1 2 7 1401 1501 American 428
## 3 2011 1 3 1 1352 1502 American 428
## 4 2011 1 4 2 1403 1513 American 428
## 5 2011 1 5 3 1405 1507 American 428
## 6 2011 1 6 4 1359 1503 American 428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>
# Merged date month and year together
df_new<- df %>% mutate(Time=paste(DayofMonth,Month,Year, sep="-"))
head(df_new)
## # A tibble: 6 x 22
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 American 428
## 2 2011 1 2 7 1401 1501 American 428
## 3 2011 1 3 1 1352 1502 American 428
## 4 2011 1 4 2 1403 1513 American 428
## 5 2011 1 5 3 1405 1507 American 428
## 6 2011 1 6 4 1359 1503 American 428
## # ... with 14 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>, Time <chr>