R for Data Science

0.1 R Basics
0.2 DPLYR

0.1 R Basics

0.1.1 Load the libraries which are required

library(dplyr)
library(ggplot2)

0.1.2 R Arithmetic

1 + 2             # addition

## [1] 3

5 - 3             # subtraction

## [1] 2

4 / 1             # division

## [1] 4

3 ^ 2             # exponential

## [1] 9

7 %% 2            # Modulo

## [1] 1

(50*2) + (50/2)   # BODMAS

## [1] 125

0.1.3 Comparison Operators

1 > 2    # will return boolean (false)

## [1] FALSE

7 > 4    # will return boolean (true)

## [1] TRUE

5>=5     # greater than equal to

## [1] TRUE

1>=2

## [1] FALSE

3<=2     # less than equal to

## [1] FALSE

2<=2

## [1] TRUE

5 != 2   # Not equal to

## [1] TRUE

7 != 7

## [1] FALSE

5 == 5   # Equal to

## [1] TRUE

2 == 3

## [1] FALSE

v1 <- c(1,2,3)    # creates a numeric vector
v2 <- c(11,22,33) # creates a numeric vector

v1 < v2   # returns a boolean vector

## [1] TRUE TRUE TRUE

0.1.4 Help and assistance in R

help(vector)       #help function is good for any help and documentation 
?data.frame      # performs the same as help function

0.1.5 R Data types

n <- 3.14   # numeric class (floating point values)
i <- 10     # integer class
t <- TRUE   # Logical class
f <- FALSE  # Logical class

ch <- "R for Data Science"  # character class (can also use single quotes)

class(n)                    # class function is used to know the data type class

## [1] "numeric"

class(i)

## [1] "numeric"

class(t)

## [1] "logical"

class(f)

## [1] "logical"

class(ch)

## [1] "character"

0.1.6 Vectors basics

nvec <- c(10,20,30,40)  # creates a numeric vector
cvec <- c('A','B','C')  # creates a character vector
lvec <- c(TRUE,FALSE)   # creates a logical vector

class(nvec) # check the classes

## [1] "numeric"

class(cvec)

## [1] "character"

class(lvec)

## [1] "logical"

temps <- c(55,67,74,68,65,77,81)
names(temps) <- c('Mon','Tue','Wed','Thu','Fri','Sat','Sun')  # names function is used to give names to the vector
temps

## Mon Tue Wed Thu Fri Sat Sun 
##  55  67  74  68  65  77  81

0.1.7 Vectors indexing and slicing

v1 <- c(100,200,300)
v2 <- c('a','b','c')

v1[2]  # get the second element

## [1] 200

v2[2]

## [1] "b"

v1[c(1,2)] # get multiple elements

## [1] 100 200

v2[c(1,3)]

## [1] "a" "c"

v <- c(1,2,3,4,5,6,7,8,9,10) # slicing example
v[2:4]                       # get the 2nd,3rd and 4th element

## [1] 2 3 4

v[5:8]                       # get 5-8 elements

## [1] 5 6 7 8

names(v) <- c('a','b','c','d','e','f','g','h','i','j')    # indexing with names 
v['c']                                                    # get the value corresponding to c

## c 
## 3

v['i']                                                    # get the value corresponding to i

## i 
## 9

v[v>2] # indexing using comparison operations

##  c  d  e  f  g  h  i  j 
##  3  4  5  6  7  8  9 10

0.1.8 Vector operations

v1 <- c(10,20,30)  # create sample vectos
v2 <- c(50,60,70)

# vector arithmetic
v1 + v2     # addition

## [1]  60  80 100

v2 - v1     # subtraction

## [1] 40 40 40

v1 * v2     # multiplication

## [1]  500 1200 2100

v2 / v1     # division

## [1] 5.000000 3.000000 2.333333

# functions
sum(v1)     # add all the elements on the vector

## [1] 60

sd(v2)      # standard deviation

## [1] 10

var(v2)     # variance

## [1] 100

max(v1)     # maximum value in the vector

## [1] 30

min(v2)     # minimum value in the vector

## [1] 50

prod(v1)    # product of all the elements in the vector

## [1] 6000

0.1.9 Dataframes overview and operations

empty_dataframe <- data.frame() # creates an empty data frame
c1 <- 1:10
c2 <- 20:29
df <- data.frame(c1,c2)
df

##    c1 c2
## 1   1 20
## 2   2 21
## 3   3 22
## 4   4 23
## 5   5 24
## 6   6 25
## 7   7 26
## 8   8 27
## 9   9 28
## 10 10 29

# the following code reads a csv file using the function read.csv() and store it as a dataframe.

#df2 <- read.csv('sample.csv',header = T,stringsAsFactors = F) 

# the following code reads an excel file using the function read_excel() and store it as a dataframe. To run this code first you also need to import the readxl package.

library(readxl)
#df3 <- read_excel('sample.xlsx',sheet = 'Sheet 1')

nrow(df)       # number of rows in dataframe

## [1] 10

ncol(df)       # number of cols in dataframe

## [1] 2

colnames(df)   # column names of dataframe

## [1] "c1" "c2"

rownames(df)   # row names of dataframe

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

head(df,n=5)   # shows the first few rows

##   c1 c2
## 1  1 20
## 2  2 21
## 3  3 22
## 4  4 23
## 5  5 24

tail(df,n=5)   # shows the last few rows

##    c1 c2
## 6   6 25
## 7   7 26
## 8   8 27
## 9   9 28
## 10 10 29

str(df)        # structure of dataframe

## 'data.frame':    10 obs. of  2 variables:
##  $ c1: int  1 2 3 4 5 6 7 8 9 10
##  $ c2: int  20 21 22 23 24 25 26 27 28 29

summary(df)    # gives summary statistic

##        c1              c2       
##  Min.   : 1.00   Min.   :20.00  
##  1st Qu.: 3.25   1st Qu.:22.25  
##  Median : 5.50   Median :24.50  
##  Mean   : 5.50   Mean   :24.50  
##  3rd Qu.: 7.75   3rd Qu.:26.75  
##  Max.   :10.00   Max.   :29.00

new_row <- c(11,30)
df <- rbind(df,new_row) # use rbind to add new rows to existing dataframe
df

##    c1 c2
## 1   1 20
## 2   2 21
## 3   3 22
## 4   4 23
## 5   5 24
## 6   6 25
## 7   7 26
## 8   8 27
## 9   9 28
## 10 10 29
## 11 11 30

df_new <- df
new_col <- 41:51
df_new <- cbind(df_new,new_col)  # use cbind to add new columns to existing dataframe
df_new

##    c1 c2 new_col
## 1   1 20      41
## 2   2 21      42
## 3   3 22      43
## 4   4 23      44
## 5   5 24      45
## 6   6 25      46
## 7   7 26      47
## 8   8 27      48
## 9   9 28      49
## 10 10 29      50
## 11 11 30      51

any(is.na(df))       # to find if there are na values in the dataframe

## [1] FALSE

any(is.na(df$c1))    # any na values in column c1

## [1] FALSE

na.omit(df)          # deletes all the na values

##    c1 c2
## 1   1 20
## 2   2 21
## 3   3 22
## 4   4 23
## 5   5 24
## 6   6 25
## 7   7 26
## 8   8 27
## 9   9 28
## 10 10 29
## 11 11 30

0.2 DPLYR

Why to use Dplyr ?

Great for exploratory data analysis and data manipulation (wrangling/munging).
Easy to write and has clear syntax. Powerful operation like chaining is very useful.
Fast and efficient on data frames operations.

library(dplyr)     # load the packages
library(hflights)

We will be using the hflights dataset which has the data for the flights departing from two Houston airports in 2011.

data(hflights)  # load the data
head(hflights)  # looks at first few rows

##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0

#tail(hlights)  # looks at the last few rows
str(hflights)   # displays the structure of the dataframe

## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...

Dplyr Verbs and functions :-

filter - Filter the rows of dataframe based on certain condition.
select - Selects the particular column(s) of the dataframe.
arrange - Sorts the dataframe in ascending order. Use desc for sorting in descending order.
mutate - Creates new columns from the existing columns.
group_by - Creates groups that will be operated on.
summarise - Summarises each group to provide aggregated values.
n_distinct(vector) - Counts the number of unique items in the vector.
n() - Counts the number of rows in each group.
tbl_df() - Creates a local dataframe that prints nicely.

flights <- tbl_df(hflights)  # creates a local dataframe that prints nicely.
flights

## # A tibble: 227,496 x 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##  * <int> <int>      <int>     <int>   <int>   <int>         <chr>
##  1  2011     1          1         6    1400    1500            AA
##  2  2011     1          2         7    1401    1501            AA
##  3  2011     1          3         1    1352    1502            AA
##  4  2011     1          4         2    1403    1513            AA
##  5  2011     1          5         3    1405    1507            AA
##  6  2011     1          6         4    1359    1503            AA
##  7  2011     1          7         5    1359    1509            AA
##  8  2011     1          8         6    1355    1454            AA
##  9  2011     1          9         7    1443    1554            AA
## 10  2011     1         10         1    1443    1553            AA
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

0.2.1 Filter

filter(flights, Month == 1 & DayofMonth == 1) # filter those rows where Month is 1 and DayofMonth is 1

## # A tibble: 552 x 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int>         <chr>
##  1  2011     1          1         6    1400    1500            AA
##  2  2011     1          1         6     728     840            AA
##  3  2011     1          1         6    1631    1736            AA
##  4  2011     1          1         6    1756    2112            AA
##  5  2011     1          1         6    1012    1347            AA
##  6  2011     1          1         6    1211    1325            AA
##  7  2011     1          1         6     557     906            AA
##  8  2011     1          1         6    1824    2106            AS
##  9  2011     1          1         6     654    1124            B6
## 10  2011     1          1         6    1639    2110            B6
## # ... with 542 more rows, and 14 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

# filter those rows where UniqueCarrier is "AA" or "UA"
filter(flights, UniqueCarrier == "AA" | UniqueCarrier == "UA")

## # A tibble: 5,316 x 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int>         <chr>
##  1  2011     1          1         6    1400    1500            AA
##  2  2011     1          2         7    1401    1501            AA
##  3  2011     1          3         1    1352    1502            AA
##  4  2011     1          4         2    1403    1513            AA
##  5  2011     1          5         3    1405    1507            AA
##  6  2011     1          6         4    1359    1503            AA
##  7  2011     1          7         5    1359    1509            AA
##  8  2011     1          8         6    1355    1454            AA
##  9  2011     1          9         7    1443    1554            AA
## 10  2011     1         10         1    1443    1553            AA
## # ... with 5,306 more rows, and 14 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

0.2.2 Select

select(flights,DepTime,ArrTime,FlightNum) # select only the specified columns

## # A tibble: 227,496 x 3
##    DepTime ArrTime FlightNum
##  *   <int>   <int>     <int>
##  1    1400    1500       428
##  2    1401    1501       428
##  3    1352    1502       428
##  4    1403    1513       428
##  5    1405    1507       428
##  6    1359    1503       428
##  7    1359    1509       428
##  8    1355    1454       428
##  9    1443    1554       428
## 10    1443    1553       428
## # ... with 227,486 more rows

select(flights,Year:DayofMonth,contains("Taxi"),contains("Delay"))  # use contains if column names are big

## # A tibble: 227,496 x 7
##     Year Month DayofMonth TaxiIn TaxiOut ArrDelay DepDelay
##  * <int> <int>      <int>  <int>   <int>    <int>    <int>
##  1  2011     1          1      7      13      -10        0
##  2  2011     1          2      6       9       -9        1
##  3  2011     1          3      5      17       -8       -8
##  4  2011     1          4      9      22        3        3
##  5  2011     1          5      9       9       -3        5
##  6  2011     1          6      6      13       -7       -1
##  7  2011     1          7     12      15       -1       -1
##  8  2011     1          8      7      12      -16       -5
##  9  2011     1          9      8      22       44       43
## 10  2011     1         10      6      19       43       43
## # ... with 227,486 more rows

0.2.3 Chaining or Pipelining

# chaining example where first you select the specified columns and then filter only those rows based on the speicfied condition

flights %>% select(UniqueCarrier,DepDelay) %>% filter(DepDelay > 60)

## # A tibble: 10,242 x 2
##    UniqueCarrier DepDelay
##            <chr>    <int>
##  1            AA       90
##  2            AA       67
##  3            AA       74
##  4            AA      125
##  5            AA       82
##  6            AA       99
##  7            AA       70
##  8            AA       61
##  9            AA       74
## 10            AS       73
## # ... with 10,232 more rows

pipe_vector1 <- c(1:5)               # chaining can be used in general R programming as well 
pipe_vector2 <- c(2:6)

(pipe_vector1-pipe_vector2)^2 %>% sum() %>% sqrt()

## [1] 2.236068

0.2.4 Arrange

# use arrange to order the dataframe in ascending order
flights %>% select(UniqueCarrier,DepDelay) %>% arrange(DepDelay)

## # A tibble: 227,496 x 2
##    UniqueCarrier DepDelay
##            <chr>    <int>
##  1            OO      -33
##  2            MQ      -23
##  3            XE      -19
##  4            XE      -19
##  5            CO      -18
##  6            EV      -18
##  7            XE      -17
##  8            CO      -17
##  9            XE      -17
## 10            MQ      -17
## # ... with 227,486 more rows

# use desc within arrange to order the dataframe in descending order
#flights %>% select(UniqueCarrier,DepDelay) %>% arrange(desc(DepDelay))

0.2.5 Mutate

# use mutate to create a new column called Speed from the existing columns
flights %>% select(Distance,AirTime) %>% mutate(Speed = Distance/AirTime*60)

## # A tibble: 227,496 x 3
##    Distance AirTime    Speed
##       <int>   <int>    <dbl>
##  1      224      40 336.0000
##  2      224      45 298.6667
##  3      224      48 280.0000
##  4      224      39 344.6154
##  5      224      44 305.4545
##  6      224      45 298.6667
##  7      224      43 312.5581
##  8      224      40 336.0000
##  9      224      41 327.8049
## 10      224      45 298.6667
## # ... with 227,486 more rows

#flights <- flights %>% mutate(Speed = Distance/AirTime*60) # save the new column to the dataframe

0.2.6 Summarise and group_by

# group by Dest and then calculate the mean avg_delay for each particular group
flights %>% group_by(Dest) %>% summarise(avg_delay = mean(ArrDelay,na.rm=TRUE))

## # A tibble: 116 x 2
##     Dest  avg_delay
##    <chr>      <dbl>
##  1   ABQ   7.226259
##  2   AEX   5.839437
##  3   AGS   4.000000
##  4   AMA   6.840095
##  5   ANC  26.080645
##  6   ASE   6.794643
##  7   ATL   8.233251
##  8   AUS   7.448718
##  9   AVL   9.973988
## 10   BFL -13.198807
## # ... with 106 more rows

# same as above but this time we are caluculating the mean on two columns (Cancelled and Diverted)
flights %>% group_by(UniqueCarrier) %>% summarise_each(funs(mean),Cancelled,Diverted)

## # A tibble: 15 x 3
##    UniqueCarrier   Cancelled    Diverted
##            <chr>       <dbl>       <dbl>
##  1            AA 0.018495684 0.001849568
##  2            AS 0.000000000 0.002739726
##  3            B6 0.025899281 0.005755396
##  4            CO 0.006782614 0.002627370
##  5            DL 0.015903067 0.003029156
##  6            EV 0.034482759 0.003176044
##  7            F9 0.007159905 0.000000000
##  8            FL 0.009817672 0.003272557
##  9            MQ 0.029044750 0.001936317
## 10            OO 0.013946828 0.003486707
## 11            UA 0.016409266 0.002413127
## 12            US 0.011268986 0.001469868
## 13            WN 0.015504047 0.002293629
## 14            XE 0.015495599 0.003449550
## 15            YV 0.012658228 0.000000000

# can also use matches to match the column names
flights %>% group_by(UniqueCarrier) %>% summarise_each(funs(min(.,na.rm=T),max(.,na.rm=T)),matches("Delay"))

## # A tibble: 15 x 5
##    UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
##            <chr>        <int>        <int>        <int>        <int>
##  1            AA          -39          -15          978          970
##  2            AS          -43          -15          183          172
##  3            B6          -44          -14          335          310
##  4            CO          -55          -18          957          981
##  5            DL          -32          -17          701          730
##  6            EV          -40          -18          469          479
##  7            F9          -24          -15          277          275
##  8            FL          -30          -14          500          507
##  9            MQ          -38          -23          918          931
## 10            OO          -57          -33          380          360
## 11            UA          -47          -11          861          869
## 12            US          -42          -17          433          425
## 13            WN          -44          -10          499          548
## 14            XE          -70          -19          634          628
## 15            YV          -32          -11           72           54

0.2.7 n() n_distinct(vector)

# n() is used to count the number of rows within each groups 
flights %>% group_by(Month,DayofMonth) %>% summarise(flight_count = n()) %>% arrange(desc(flight_count))

## # A tibble: 365 x 3
## # Groups:   Month [12]
##    Month DayofMonth flight_count
##    <int>      <int>        <int>
##  1     8          4          706
##  2     8         11          706
##  3     8         12          706
##  4     8          5          705
##  5     8          3          704
##  6     8         10          704
##  7     1          3          702
##  8     7          7          702
##  9     7         14          702
## 10     7         28          701
## # ... with 355 more rows

# n_distinct() gives you unique or distinct values within each group
flights %>% group_by(Dest) %>% summarise(flight_count = n(), plane_count = n_distinct(TailNum))

## # A tibble: 116 x 3
##     Dest flight_count plane_count
##    <chr>        <int>       <int>
##  1   ABQ         2812         716
##  2   AEX          724         215
##  3   AGS            1           1
##  4   AMA         1297         158
##  5   ANC          125          38
##  6   ASE          125          60
##  7   ATL         7886         983
##  8   AUS         5022        1015
##  9   AVL          350         142
## 10   BFL          504          70
## # ... with 106 more rows

0.2.8 Other functions

glimpse(flights)  # same as str() function but much better looking

## Observations: 227,496
## Variables: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

flights %>% sample_n(10)  # randomly selects 10 rows from the dataframe

## # A tibble: 10 x 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int>         <chr>
##  1  2011     3         30         3     925    1045            OO
##  2  2011     5         16         1    1805    2044            CO
##  3  2011     9         19         1    2109    2202            XE
##  4  2011     9         14         3    1733    2154            CO
##  5  2011    12         31         6    1900    2049            XE
##  6  2011    12         25         7    1531    1743            WN
##  7  2011     5         20         5     744    1149            CO
##  8  2011     6          6         1    1034    1356            CO
##  9  2011     6          9         4    1150    1316            CO
## 10  2011     5          2         1    1247    1405            XE
## # ... with 14 more variables: FlightNum <int>, TailNum <chr>,
## #   ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

This is the dplyr tutorial which explains data manipulation using dplyr in R. https://www.youtube.com/watch?v=jWjqLW-u3hc

This is the Rpubs document which explains the dplyr tutorial along with comparing it with base R functions. https://rpubs.com/justmarkham/dplyr-tutorial