This tutorial will be focusing on dplyr package. Dataset used in this section is hflights
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
## Warning: package 'hflights' was built under R version 3.2.5
# Investigate first few rows of dataset
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
It is important to convert this dataset to local dataframe using tbl_df method
df<-tbl_df(hflights)
# They are basically the same, but this now behaves as local data frame
head(df)
## # A tibble: 6 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>
Chech the structure of dataset
glimpse(df)
## Observations: 227,496
## Variables: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
filter verbSelect all 1st January of years in dataset
# Using `and` operator
month_day<- df %>% filter(Month==1,DayofMonth==1)
head(month_day)
## # A tibble: 6 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## 6 2011 1 1 6 1211 1325 AA 1820
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>
# Using `or` operator
month_day1<-df %>% filter(Month==1|DayofMonth==1)
head(month_day1)
## # A tibble: 6 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>
select verbSelect three columns DayofMonth,UniqueCarrier and TailNum from df dataset
# There are few subverbs associated with `select` verb like "matches, contains,starts_with,ends_with"
col1<-df %>% select(DayofMonth,UniqueCarrier,TailNum)
# Select all columns matching or contains `ar` in column names
col2<-df %>% select(matches("Ar"))
# Select all columns if it contains `Ar`
col3<-df %>% select(TailNum,contains("Ar"))
# Remove desired columns
col4<-df %>% select(-c(Year:FlightNum))
# Select a predefined range of columns
col5<- df %>% select(Year:FlightNum, contains("un"))
# Select a range of columns by indexing or using integer
col6<-df %>% select(1:5)
# select columns by indexing and using subverb
col7<- df%>% select(1:3,contains("Un"),starts_with("F"))
# Select and order columns
col8<- df%>% select(1:5) %>% arrange(desc(DepTime)) # from biggest to smallest values
col8<- df%>% select(1:5) %>% arrange(DepTime) # from smallest to biggest values
head(col8)
## # A tibble: 6 × 5
## Year Month DayofMonth DayOfWeek DepTime
## <int> <int> <int> <int> <int>
## 1 2011 1 1 6 1
## 2 2011 3 11 5 1
## 3 2011 4 29 5 1
## 4 2011 6 24 5 1
## 5 2011 11 15 2 1
## 6 2011 12 24 6 1
mutate verb, which is used to create a new variable from existing variables
# Calculate speed/km
mu1<-df%>% select(Distance,ArrTime) %>% mutate(Speed1=Distance/(ArrTime/60))
head(mu1)
## # A tibble: 6 × 3
## Distance ArrTime Speed1
## <int> <int> <dbl>
## 1 224 1500 8.960000
## 2 224 1501 8.954031
## 3 224 1502 8.948069
## 4 224 1513 8.883014
## 5 224 1507 8.918381
## 6 224 1503 8.942116
summarise verb
# Summarise or calculate mean ò delay time by Dest
sm1<-df %>% group_by(Dest) %>% summarise(average_delay=mean(ArrDelay,na.rm=T),num_row=n())
# Calculate the avarage time of cancellation for each carrier
mu2<-df%>% group_by(UniqueCarrier) %>% summarise_each(funs(mean),Cancelled) # Basically no difference from the above-mentioned verb
head(mu2)
## # A tibble: 6 × 2
## UniqueCarrier Cancelled
## <chr> <dbl>
## 1 AA 0.018495684
## 2 AS 0.000000000
## 3 B6 0.025899281
## 4 CO 0.006782614
## 5 DL 0.015903067
## 6 EV 0.034482759
Another interesting example of summarise_each
# Calculate `min`, `max` of delay by each Carrier
mu3<-df %>% group_by(UniqueCarrier) %>% summarise_each(funs(min(.,na.rm=T),max(.,na.rm=T)),matches("Delay"))
head(mu3)
## # A tibble: 6 × 5
## UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
## <chr> <int> <int> <int> <int>
## 1 AA -39 -15 978 970
## 2 AS -43 -15 183 172
## 3 B6 -44 -14 335 310
## 4 CO -55 -18 957 981
## 5 DL -32 -17 701 730
## 6 EV -40 -18 469 479
# Calculate the number of flights departed by month, day of month and Carrier
n1<- df%>% group_by(Month,DayofMonth,UniqueCarrier) %>% summarise(flight_number=n()) %>% arrange()
head(n1)
## Source: local data frame [6 x 4]
## Groups: Month, DayofMonth [1]
##
## Month DayofMonth UniqueCarrier flight_number
## <int> <int> <chr> <int>
## 1 1 1 AA 7
## 2 1 1 AS 1
## 3 1 1 B6 2
## 4 1 1 CO 192
## 5 1 1 DL 4
## 6 1 1 EV 5
# Using distint to calculate the unique number of values in a defined column
dt1<-df %>% group_by(Dest,Month) %>% summarise(Flight_number=n(),Uni_Tail=n_distinct(TailNum))
head(dt1)
## Source: local data frame [6 x 4]
## Groups: Dest [1]
##
## Dest Month Flight_number Uni_Tail
## <chr> <int> <int> <int>
## 1 ABQ 1 254 184
## 2 ABQ 2 231 155
## 3 ABQ 3 260 171
## 4 ABQ 4 248 159
## 5 ABQ 5 231 164
## 6 ABQ 6 232 169
For each Carrier, calculate which two days of the year has the loggest departure delay
delay<-df%>% group_by(UniqueCarrier) %>% select(Month,DayofMonth,DepDelay) %>% top_n(2) %>% arrange(UniqueCarrier,desc(DepDelay))
## Adding missing grouping variables: `UniqueCarrier`
## Selecting by DepDelay
head(delay)
## Source: local data frame [6 x 4]
## Groups: UniqueCarrier [3]
##
## UniqueCarrier Month DayofMonth DepDelay
## <chr> <int> <int> <int>
## 1 AA 12 12 970
## 2 AA 11 19 677
## 3 AS 2 28 172
## 4 AS 7 6 138
## 5 B6 10 29 310
## 6 B6 8 19 283
Randomly sample a fixed number of rows without replacement
sample1<- df%>% sample_n(5)
# Randomly select a fraction of rows
sample2<-df %>% sample_frac(size=0.1,replace=T)
head(sample2)
## # A tibble: 6 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 7 12 2 1946 2038 XE 2731
## 2 2011 4 3 7 1534 1727 XE 2479
## 3 2011 11 15 2 1257 1607 CO 1621
## 4 2011 12 31 6 2022 2110 XE 4615
## 5 2011 8 21 7 1557 1735 XE 2320
## 6 2011 10 24 1 737 1045 CO 1161
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>
**Deeper with dplyr package
# Select all columns, except for two columns Year and Month
col8<-df %>% select(-Year,-Month)
# Select all columns, except for columns matching these names
col9<-df %>% select(-contains("Arr"))
head(col9)
## # A tibble: 6 × 18
## Year Month DayofMonth DayOfWeek DepTime FlightNum TailNum
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 428 N576AA
## 2 2011 1 2 7 1401 428 N557AA
## 3 2011 1 3 1 1352 428 N541AA
## 4 2011 1 4 2 1403 428 N403AA
## 5 2011 1 5 3 1405 428 N492AA
## 6 2011 1 6 4 1359 428 N262AA
## # ... with 11 more variables: ActualElapsedTime <int>, AirTime <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
*rename verb
rename1<-df%>% rename(year=Year,month=Month)
head(rename1)
## # A tibble: 6 × 21
## year month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>
Remove NA values from a certain column
# remove NA values from column "AirTime"
not_NA<-df %>% filter(!is.na(AirTime))
# Filter observations with values between 1200 and 1400
dep1<- df%>% select(1:6) %>% filter(between(DepTime,1200,1400))
head(dep1)
## # A tibble: 6 × 6
## Year Month DayofMonth DayOfWeek DepTime ArrTime
## <int> <int> <int> <int> <int> <int>
## 1 2011 1 1 6 1400 1500
## 2 2011 1 3 1 1352 1502
## 3 2011 1 6 4 1359 1503
## 4 2011 1 7 5 1359 1509
## 5 2011 1 8 6 1355 1454
## 6 2011 1 13 4 1358 1501
slice verb used to filter rows by positionfilter1<- df %>% select(1:4) %>% slice(1000:1230)
head(filter1)
## # A tibble: 6 × 4
## Year Month DayofMonth DayOfWeek
## <int> <int> <int> <int>
## 1 2011 1 28 5
## 2 2011 1 28 5
## 3 2011 1 28 5
## 4 2011 1 28 5
## 5 2011 1 28 5
## 6 2011 1 28 5
filter2<-df%>% group_by(UniqueCarrier,Month,Year) %>% select(UniqueCarrier,Month,Year,AirTime) %>% top_n(3,AirTime) %>% arrange(Month,desc(AirTime))
head(filter2)
## Source: local data frame [6 x 4]
## Groups: UniqueCarrier, Month, Year [2]
##
## UniqueCarrier Month Year AirTime
## <chr> <int> <int> <int>
## 1 CO 1 2011 497
## 2 CO 1 2011 496
## 3 CO 1 2011 494
## 4 AS 1 2011 315
## 5 AS 1 2011 310
## 6 AS 1 2011 291
unique and distinct verb used to count the number of unique values in a certain columndis1<- df %>% select(UniqueCarrier,Dest) %>% unique()
dim(dis1)
## [1] 241 2
# Alternative
dis2<-df %>% select(UniqueCarrier,Dest) %>% distinct()
head(dis2)
## # A tibble: 6 × 2
## UniqueCarrier Dest
## <chr> <chr>
## 1 AA DFW
## 2 AA MIA
## 3 AS SEA
## 4 B6 JFK
## 5 CO HNL
## 6 CO MSY
# adding row to column name
mtcar<- mtcars %>% add_rownames("Car_Model")
## Warning: Deprecated, use tibble::rownames_to_column() instead.
head(mtcar)
## # A tibble: 6 × 12
## Car_Model mpg cyl disp hp drat wt qsec vs am
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1
## 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1
## 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1
## 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0
## 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0
## 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0
## # ... with 2 more variables: gear <dbl>, carb <dbl>
count, tally and n() verbs used to count the number of rows by group# Count the number of flights by UniqueCarrier
count1<- df%>% count(UniqueCarrier,sort=T)
# Tally
count2<-df %>% group_by(UniqueCarrier) %>% tally(sort=T)
head(count2)
## # A tibble: 6 × 2
## UniqueCarrier n
## <chr> <int>
## 1 XE 73053
## 2 CO 70032
## 3 WN 45343
## 4 OO 16061
## 5 MQ 4648
## 6 US 4082
n_groups verb
# Calculate the number of groups
num1<-df %>% group_by(UniqueCarrier) %>% n_groups()
head(num1)
## [1] 15
ungroup and arrange verbs# This command will group and sort by Month
num2<-df %>% group_by(Month,DayofMonth) %>% summarise(count1=n()) %>% arrange(desc(count1))
head(num2)
## Source: local data frame [6 x 3]
## Groups: Month [1]
##
## Month DayofMonth count1
## <int> <int> <int>
## 1 8 4 706
## 2 8 11 706
## 3 8 12 706
## 4 8 5 705
## 5 8 3 704
## 6 8 10 704
# How about sort across all three columns
num3<-df %>% group_by(Month,DayofMonth) %>% summarise(count1=n()) %>% ungroup() %>% arrange(desc(count1))
head(num3)
## # A tibble: 6 × 3
## Month DayofMonth count1
## <int> <int> <int>
## 1 8 4 706
## 2 8 11 706
## 3 8 12 706
## 4 8 5 705
## 5 8 3 704
## 6 8 10 704