R Notebook

This tutorial will be focusing on dplyr package. Dataset used in this section is hflights

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.5

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(hflights)

## Warning: package 'hflights' was built under R version 3.2.5

# Investigate first few rows of dataset

head(hflights)

##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0

It is important to convert this dataset to local dataframe using tbl_df method

df<-tbl_df(hflights)

# They are basically the same, but this  now behaves as local data frame

head(df)

## # A tibble: 6 × 21
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>

Chech the structure of dataset

glimpse(df)

## Observations: 227,496
## Variables: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

filter verb

Select all 1st January of years in dataset

# Using `and` operator

month_day<- df %>% filter(Month==1,DayofMonth==1)

head(month_day)

## # A tibble: 6 × 21
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          1         6     728     840            AA       460
## 3  2011     1          1         6    1631    1736            AA      1121
## 4  2011     1          1         6    1756    2112            AA      1294
## 5  2011     1          1         6    1012    1347            AA      1700
## 6  2011     1          1         6    1211    1325            AA      1820
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>

# Using `or` operator 

month_day1<-df %>% filter(Month==1|DayofMonth==1)

head(month_day1)

## # A tibble: 6 × 21
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>

select verb

Select three columns DayofMonth,UniqueCarrier and TailNum from df dataset

# There are few subverbs associated with `select` verb like "matches, contains,starts_with,ends_with"

col1<-df %>% select(DayofMonth,UniqueCarrier,TailNum)

# Select all columns matching or contains `ar` in column names

col2<-df %>% select(matches("Ar"))

# Select all columns if it contains `Ar`

col3<-df %>% select(TailNum,contains("Ar"))

# Remove desired columns

col4<-df %>% select(-c(Year:FlightNum))

# Select a predefined range of columns

col5<- df %>% select(Year:FlightNum, contains("un"))
# Select a range of columns by indexing or using integer 

col6<-df %>% select(1:5)
# select columns by indexing and using subverb

col7<- df%>% select(1:3,contains("Un"),starts_with("F"))
# Select and order columns 

col8<- df%>% select(1:5) %>% arrange(desc(DepTime))  # from biggest to smallest values

col8<- df%>% select(1:5) %>% arrange(DepTime) # from smallest to biggest values

head(col8)

## # A tibble: 6 × 5
##    Year Month DayofMonth DayOfWeek DepTime
##   <int> <int>      <int>     <int>   <int>
## 1  2011     1          1         6       1
## 2  2011     3         11         5       1
## 3  2011     4         29         5       1
## 4  2011     6         24         5       1
## 5  2011    11         15         2       1
## 6  2011    12         24         6       1

mutate verb, which is used to create a new variable from existing variables

# Calculate speed/km 
mu1<-df%>% select(Distance,ArrTime) %>% mutate(Speed1=Distance/(ArrTime/60))

head(mu1)

## # A tibble: 6 × 3
##   Distance ArrTime   Speed1
##      <int>   <int>    <dbl>
## 1      224    1500 8.960000
## 2      224    1501 8.954031
## 3      224    1502 8.948069
## 4      224    1513 8.883014
## 5      224    1507 8.918381
## 6      224    1503 8.942116

summarise verb

# Summarise or calculate mean ò delay time by Dest

sm1<-df %>% group_by(Dest) %>% summarise(average_delay=mean(ArrDelay,na.rm=T),num_row=n())

# Calculate the avarage time of cancellation for each carrier 
mu2<-df%>% group_by(UniqueCarrier) %>% summarise_each(funs(mean),Cancelled) # Basically no difference from the above-mentioned verb

head(mu2)

## # A tibble: 6 × 2
##   UniqueCarrier   Cancelled
##           <chr>       <dbl>
## 1            AA 0.018495684
## 2            AS 0.000000000
## 3            B6 0.025899281
## 4            CO 0.006782614
## 5            DL 0.015903067
## 6            EV 0.034482759

Another interesting example of summarise_each

# Calculate `min`, `max` of delay by each Carrier
mu3<-df %>% group_by(UniqueCarrier) %>% summarise_each(funs(min(.,na.rm=T),max(.,na.rm=T)),matches("Delay"))

head(mu3)

## # A tibble: 6 × 5
##   UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
##           <chr>        <int>        <int>        <int>        <int>
## 1            AA          -39          -15          978          970
## 2            AS          -43          -15          183          172
## 3            B6          -44          -14          335          310
## 4            CO          -55          -18          957          981
## 5            DL          -32          -17          701          730
## 6            EV          -40          -18          469          479

# Calculate the number of flights departed by month, day of month and Carrier
n1<- df%>% group_by(Month,DayofMonth,UniqueCarrier) %>% summarise(flight_number=n()) %>% arrange()

head(n1)

## Source: local data frame [6 x 4]
## Groups: Month, DayofMonth [1]
## 
##   Month DayofMonth UniqueCarrier flight_number
##   <int>      <int>         <chr>         <int>
## 1     1          1            AA             7
## 2     1          1            AS             1
## 3     1          1            B6             2
## 4     1          1            CO           192
## 5     1          1            DL             4
## 6     1          1            EV             5

# Using distint to calculate the unique number of values in a defined column

dt1<-df %>% group_by(Dest,Month) %>% summarise(Flight_number=n(),Uni_Tail=n_distinct(TailNum))


head(dt1)

## Source: local data frame [6 x 4]
## Groups: Dest [1]
## 
##    Dest Month Flight_number Uni_Tail
##   <chr> <int>         <int>    <int>
## 1   ABQ     1           254      184
## 2   ABQ     2           231      155
## 3   ABQ     3           260      171
## 4   ABQ     4           248      159
## 5   ABQ     5           231      164
## 6   ABQ     6           232      169

For each Carrier, calculate which two days of the year has the loggest departure delay

delay<-df%>% group_by(UniqueCarrier) %>% select(Month,DayofMonth,DepDelay) %>% top_n(2) %>% arrange(UniqueCarrier,desc(DepDelay))

## Adding missing grouping variables: `UniqueCarrier`

## Selecting by DepDelay

head(delay)

## Source: local data frame [6 x 4]
## Groups: UniqueCarrier [3]
## 
##   UniqueCarrier Month DayofMonth DepDelay
##           <chr> <int>      <int>    <int>
## 1            AA    12         12      970
## 2            AA    11         19      677
## 3            AS     2         28      172
## 4            AS     7          6      138
## 5            B6    10         29      310
## 6            B6     8         19      283

Randomly sample a fixed number of rows without replacement

sample1<- df%>% sample_n(5)

# Randomly select a fraction of rows
sample2<-df %>% sample_frac(size=0.1,replace=T)

head(sample2)

## # A tibble: 6 × 21
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     7         12         2    1946    2038            XE      2731
## 2  2011     4          3         7    1534    1727            XE      2479
## 3  2011    11         15         2    1257    1607            CO      1621
## 4  2011    12         31         6    2022    2110            XE      4615
## 5  2011     8         21         7    1557    1735            XE      2320
## 6  2011    10         24         1     737    1045            CO      1161
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>

**Deeper with dplyr package

Select* verb

# Select all columns, except for two columns Year and Month
col8<-df %>% select(-Year,-Month)
# Select all columns, except for columns matching these names

col9<-df %>% select(-contains("Arr"))

head(col9)

## # A tibble: 6 × 18
##    Year Month DayofMonth DayOfWeek DepTime FlightNum TailNum
##   <int> <int>      <int>     <int>   <int>     <int>   <chr>
## 1  2011     1          1         6    1400       428  N576AA
## 2  2011     1          2         7    1401       428  N557AA
## 3  2011     1          3         1    1352       428  N541AA
## 4  2011     1          4         2    1403       428  N403AA
## 5  2011     1          5         3    1405       428  N492AA
## 6  2011     1          6         4    1359       428  N262AA
## # ... with 11 more variables: ActualElapsedTime <int>, AirTime <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

*rename verb

rename1<-df%>% rename(year=Year,month=Month)

head(rename1)

## # A tibble: 6 × 21
##    year month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   <int> <int>      <int>     <int>   <int>   <int>         <chr>     <int>
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## # ... with 13 more variables: TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>

Remove NA values from a certain column

# remove NA values from column "AirTime"
not_NA<-df %>% filter(!is.na(AirTime))
# Filter observations with values between 1200 and 1400

dep1<- df%>% select(1:6) %>% filter(between(DepTime,1200,1400))

head(dep1)

## # A tibble: 6 × 6
##    Year Month DayofMonth DayOfWeek DepTime ArrTime
##   <int> <int>      <int>     <int>   <int>   <int>
## 1  2011     1          1         6    1400    1500
## 2  2011     1          3         1    1352    1502
## 3  2011     1          6         4    1359    1503
## 4  2011     1          7         5    1359    1509
## 5  2011     1          8         6    1355    1454
## 6  2011     1         13         4    1358    1501

slice verb used to filter rows by position

filter1<- df %>% select(1:4) %>% slice(1000:1230)

head(filter1)

## # A tibble: 6 × 4
##    Year Month DayofMonth DayOfWeek
##   <int> <int>      <int>     <int>
## 1  2011     1         28         5
## 2  2011     1         28         5
## 3  2011     1         28         5
## 4  2011     1         28         5
## 5  2011     1         28         5
## 6  2011     1         28         5

sort by rows within each group

filter2<-df%>% group_by(UniqueCarrier,Month,Year) %>% select(UniqueCarrier,Month,Year,AirTime) %>% top_n(3,AirTime) %>% arrange(Month,desc(AirTime))

head(filter2)

## Source: local data frame [6 x 4]
## Groups: UniqueCarrier, Month, Year [2]
## 
##   UniqueCarrier Month  Year AirTime
##           <chr> <int> <int>   <int>
## 1            CO     1  2011     497
## 2            CO     1  2011     496
## 3            CO     1  2011     494
## 4            AS     1  2011     315
## 5            AS     1  2011     310
## 6            AS     1  2011     291

unique and distinct verb used to count the number of unique values in a certain column

dis1<- df %>% select(UniqueCarrier,Dest) %>% unique()

dim(dis1)

## [1] 241   2

# Alternative

dis2<-df %>% select(UniqueCarrier,Dest) %>% distinct()

head(dis2)

## # A tibble: 6 × 2
##   UniqueCarrier  Dest
##           <chr> <chr>
## 1            AA   DFW
## 2            AA   MIA
## 3            AS   SEA
## 4            B6   JFK
## 5            CO   HNL
## 6            CO   MSY

Adding row names

# adding row to column name
mtcar<- mtcars %>% add_rownames("Car_Model")

## Warning: Deprecated, use tibble::rownames_to_column() instead.

head(mtcar)

## # A tibble: 6 × 12
##           Car_Model   mpg   cyl  disp    hp  drat    wt  qsec    vs    am
##               <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1         Mazda RX4  21.0     6   160   110  3.90 2.620 16.46     0     1
## 2     Mazda RX4 Wag  21.0     6   160   110  3.90 2.875 17.02     0     1
## 3        Datsun 710  22.8     4   108    93  3.85 2.320 18.61     1     1
## 4    Hornet 4 Drive  21.4     6   258   110  3.08 3.215 19.44     1     0
## 5 Hornet Sportabout  18.7     8   360   175  3.15 3.440 17.02     0     0
## 6           Valiant  18.1     6   225   105  2.76 3.460 20.22     1     0
## # ... with 2 more variables: gear <dbl>, carb <dbl>

count, tally and n() verbs used to count the number of rows by group

# Count the number of flights by UniqueCarrier

count1<- df%>%  count(UniqueCarrier,sort=T)
# Tally 

count2<-df %>% group_by(UniqueCarrier) %>% tally(sort=T)


head(count2)

## # A tibble: 6 × 2
##   UniqueCarrier     n
##           <chr> <int>
## 1            XE 73053
## 2            CO 70032
## 3            WN 45343
## 4            OO 16061
## 5            MQ  4648
## 6            US  4082

n_groups verb

# Calculate the number of groups

num1<-df %>% group_by(UniqueCarrier) %>% n_groups()

head(num1)

## [1] 15

ungroup and arrange verbs

# This command will group and sort by Month
num2<-df %>% group_by(Month,DayofMonth) %>% summarise(count1=n()) %>% arrange(desc(count1))

head(num2)

## Source: local data frame [6 x 3]
## Groups: Month [1]
## 
##   Month DayofMonth count1
##   <int>      <int>  <int>
## 1     8          4    706
## 2     8         11    706
## 3     8         12    706
## 4     8          5    705
## 5     8          3    704
## 6     8         10    704

# How about sort across all three columns 

num3<-df %>% group_by(Month,DayofMonth) %>% summarise(count1=n()) %>% ungroup() %>% arrange(desc(count1))

head(num3)

## # A tibble: 6 × 3
##   Month DayofMonth count1
##   <int>      <int>  <int>
## 1     8          4    706
## 2     8         11    706
## 3     8         12    706
## 4     8          5    705
## 5     8          3    704
## 6     8         10    704