## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
got <- read.csv("got.csv", header = TRUE, stringsAsFactors = FALSE)
kable(got)
lastname firstname major year gpa
Snow John Nordic Studies Junior 3.23
Lannister Tyrion Communications Sophomore 3.83
Targaryen Daenerys Zoology Freshman 3.36
Bolton Ramsay Phys Ed Freshman 2.24
Stark Eddard History Senior 2.78
Clegane Gregor Phys Ed Sophomore 3.23
Baelish Peter Communications Freshman 2.84
Baratheon Joffrey History Freshman 1.87
Drogo Khal Zoology Senior 3.38
Tarly Samwise Nordic Studies Freshman 2.39

Practice dplyr verbs

select()

  1. Select columns
  2. Change names
kable(got %>% select(surname = lastname, 3:5))
surname major year gpa
Snow Nordic Studies Junior 3.23
Lannister Communications Sophomore 3.83
Targaryen Zoology Freshman 3.36
Bolton Phys Ed Freshman 2.24
Stark History Senior 2.78
Clegane Phys Ed Sophomore 3.23
Baelish Communications Freshman 2.84
Baratheon History Freshman 1.87
Drogo Zoology Senior 3.38
Tarly Nordic Studies Freshman 2.39

Can also use rename() to change names

kable(got %>% rename(surname = lastname))
surname firstname major year gpa
Snow John Nordic Studies Junior 3.23
Lannister Tyrion Communications Sophomore 3.83
Targaryen Daenerys Zoology Freshman 3.36
Bolton Ramsay Phys Ed Freshman 2.24
Stark Eddard History Senior 2.78
Clegane Gregor Phys Ed Sophomore 3.23
Baelish Peter Communications Freshman 2.84
Baratheon Joffrey History Freshman 1.87
Drogo Khal Zoology Senior 3.38
Tarly Samwise Nordic Studies Freshman 2.39

If want to see only those with a gpa >= 3.5, use filter()

Also used rename() to change the lastname variable

kable(got %>% filter(gpa >= 3.5) %>% rename(surname = lastname))
surname firstname major year gpa
Lannister Tyrion Communications Sophomore 3.83

Sort from highest to lowest, use arrange()

kable(got %>% arrange(desc(gpa)))
lastname firstname major year gpa
Lannister Tyrion Communications Sophomore 3.83
Drogo Khal Zoology Senior 3.38
Targaryen Daenerys Zoology Freshman 3.36
Snow John Nordic Studies Junior 3.23
Clegane Gregor Phys Ed Sophomore 3.23
Baelish Peter Communications Freshman 2.84
Stark Eddard History Senior 2.78
Tarly Samwise Nordic Studies Freshman 2.39
Bolton Ramsay Phys Ed Freshman 2.24
Baratheon Joffrey History Freshman 1.87

Create new columns with transformed data, typically, use mutate()

kable(got %>% arrange(desc(gpa)) %>% mutate(dlist = gpa >=3.5))
lastname firstname major year gpa dlist
Lannister Tyrion Communications Sophomore 3.83 TRUE
Drogo Khal Zoology Senior 3.38 FALSE
Targaryen Daenerys Zoology Freshman 3.36 FALSE
Snow John Nordic Studies Junior 3.23 FALSE
Clegane Gregor Phys Ed Sophomore 3.23 FALSE
Baelish Peter Communications Freshman 2.84 FALSE
Stark Eddard History Senior 2.78 FALSE
Tarly Samwise Nordic Studies Freshman 2.39 FALSE
Bolton Ramsay Phys Ed Freshman 2.24 FALSE
Baratheon Joffrey History Freshman 1.87 FALSE

To transform the data, but only show that newly created column, use transmute()

kable(got %>% transmute(name = paste(firstname, lastname), dlist = gpa >= 3.5))
name dlist
John Snow FALSE
Tyrion Lannister TRUE
Daenerys Targaryen FALSE
Ramsay Bolton FALSE
Eddard Stark FALSE
Gregor Clegane FALSE
Peter Baelish FALSE
Joffrey Baratheon FALSE
Khal Drogo FALSE
Samwise Tarly FALSE

To look at distinct outputs, like the majors in this data frame, use distinct()

Also, can use arrange() to put the majors in alphabetical order

pander(got %>% distinct(major) %>% arrange(major))
major
Communications
History
Nordic Studies
Phys Ed
Zoology

To do aggregating, use group_by() and often summarise()

Here, to aggreagte by major, and then summarize by the average gpa…

kable(got %>% group_by(major) %>% summarise(average_gpa = mean(gpa)))
major average_gpa
Communications 3.335
History 2.325
Nordic Studies 2.810
Phys Ed 2.735
Zoology 3.370

To see a particular segment of a group, use slice() which is a window function

Find the best student in each major

kable(got %>% group_by(major) %>% arrange(desc(gpa)) %>% slice(1))
lastname firstname major year gpa
Lannister Tyrion Communications Sophomore 3.83
Stark Eddard History Senior 2.78
Snow John Nordic Studies Junior 3.23
Clegane Gregor Phys Ed Sophomore 3.23
Drogo Khal Zoology Senior 3.38

To do find the top , use top_n()

kable(got %>% group_by(major) %>% arrange(desc(gpa)) %>% top_n(1))
## Selecting by gpa
lastname firstname major year gpa
Lannister Tyrion Communications Sophomore 3.83
Drogo Khal Zoology Senior 3.38
Snow John Nordic Studies Junior 3.23
Clegane Gregor Phys Ed Sophomore 3.23
Stark Eddard History Senior 2.78

To find the bottom of a group, use min_rank()

kable(got %>% group_by(major) %>% filter(min_rank(desc(gpa))== 1))
lastname firstname major year gpa
Snow John Nordic Studies Junior 3.23
Lannister Tyrion Communications Sophomore 3.83
Stark Eddard History Senior 2.78
Clegane Gregor Phys Ed Sophomore 3.23
Drogo Khal Zoology Senior 3.38

Window functions can be broken into 2 categories

  1. offset functions
  2. ranking and ordering functions

Who is the lowest ranked gpa? Can use row_number()

kable(got %>% filter(row_number(desc(gpa)) == 7))
lastname firstname major year gpa
Stark Eddard History Senior 2.78

Or can use filter(min_rank())

kable(got %>% filter(min_rank(desc(gpa)) == 7))
lastname firstname major year gpa
Stark Eddard History Senior 2.78

It functions a litle differently than row number, i.e., if there’s a tie.

kable(got %>% filter(min_rank(desc(gpa)) == 4))
lastname firstname major year gpa
Snow John Nordic Studies Junior 3.23
Clegane Gregor Phys Ed Sophomore 3.23

There’s dense_rank()

To find the sixth ranked person, use 5

kable(got %>% filter(dense_rank(desc(gpa)) == 5))
lastname firstname major year gpa
Baelish Peter Communications Freshman 2.84

Should use slice()

kable(got %>% arrange(desc(gpa)) %>% slice(6))
lastname firstname major year gpa
Baelish Peter Communications Freshman 2.84

To find where a variable falls percentage wise, can use percent_rank()

kable(got %>% arrange(desc(gpa))%>% mutate(p_rank = percent_rank(gpa)), digits = 2) 
lastname firstname major year gpa p_rank
Lannister Tyrion Communications Sophomore 3.83 1.00
Drogo Khal Zoology Senior 3.38 0.89
Targaryen Daenerys Zoology Freshman 3.36 0.78
Snow John Nordic Studies Junior 3.23 0.56
Clegane Gregor Phys Ed Sophomore 3.23 0.56
Baelish Peter Communications Freshman 2.84 0.44
Stark Eddard History Senior 2.78 0.33
Tarly Samwise Nordic Studies Freshman 2.39 0.22
Bolton Ramsay Phys Ed Freshman 2.24 0.11
Baratheon Joffrey History Freshman 1.87 0.00

To find the proportion of all values less than or equal to the current rank, use cume_rank()

kable(got %>% arrange(desc(gpa))%>% mutate(cum_dist = cume_dist(gpa)), digits = 2) 
lastname firstname major year gpa cum_dist
Lannister Tyrion Communications Sophomore 3.83 1.0
Drogo Khal Zoology Senior 3.38 0.9
Targaryen Daenerys Zoology Freshman 3.36 0.8
Snow John Nordic Studies Junior 3.23 0.7
Clegane Gregor Phys Ed Sophomore 3.23 0.7
Baelish Peter Communications Freshman 2.84 0.5
Stark Eddard History Senior 2.78 0.4
Tarly Samwise Nordic Studies Freshman 2.39 0.3
Bolton Ramsay Phys Ed Freshman 2.24 0.2
Baratheon Joffrey History Freshman 1.87 0.1

To find a rough rank, use ntile() which breaks the vector into n buckets

kable(got %>% arrange(desc(gpa)) %>% mutate(ntile = ntile(gpa, 4)))
lastname firstname major year gpa ntile
Lannister Tyrion Communications Sophomore 3.83 4
Drogo Khal Zoology Senior 3.38 4
Targaryen Daenerys Zoology Freshman 3.36 3
Snow John Nordic Studies Junior 3.23 3
Clegane Gregor Phys Ed Sophomore 3.23 3
Baelish Peter Communications Freshman 2.84 2
Stark Eddard History Senior 2.78 2
Tarly Samwise Nordic Studies Freshman 2.39 1
Bolton Ramsay Phys Ed Freshman 2.24 1
Baratheon Joffrey History Freshman 1.87 1

Can put these altoghther to get three variables

kable(got %>% arrange(desc(gpa)) %>% mutate(prank = percent_rank(gpa), crank = cume_dist(gpa), nrank = ntile(gpa, 4)), digits = 2)
lastname firstname major year gpa prank crank nrank
Lannister Tyrion Communications Sophomore 3.83 1.00 1.0 4
Drogo Khal Zoology Senior 3.38 0.89 0.9 4
Targaryen Daenerys Zoology Freshman 3.36 0.78 0.8 3
Snow John Nordic Studies Junior 3.23 0.56 0.7 3
Clegane Gregor Phys Ed Sophomore 3.23 0.56 0.7 3
Baelish Peter Communications Freshman 2.84 0.44 0.5 2
Stark Eddard History Senior 2.78 0.33 0.4 2
Tarly Samwise Nordic Studies Freshman 2.39 0.22 0.3 1
Bolton Ramsay Phys Ed Freshman 2.24 0.11 0.2 1
Baratheon Joffrey History Freshman 1.87 0.00 0.1 1

To find the previous value in a vector, use lag()

kable(got %>% arrange(desc(gpa)) %>% mutate(OneBetterGot = lag(gpa )))
lastname firstname major year gpa OneBetterGot
Lannister Tyrion Communications Sophomore 3.83 NA
Drogo Khal Zoology Senior 3.38 3.83
Targaryen Daenerys Zoology Freshman 3.36 3.38
Snow John Nordic Studies Junior 3.23 3.36
Clegane Gregor Phys Ed Sophomore 3.23 3.23
Baelish Peter Communications Freshman 2.84 3.23
Stark Eddard History Senior 2.78 2.84
Tarly Samwise Nordic Studies Freshman 2.39 2.78
Bolton Ramsay Phys Ed Freshman 2.24 2.39
Baratheon Joffrey History Freshman 1.87 2.24

To see the value of the next below, use lead()

kable(got %>% arrange(desc(gpa)) %>% mutate(OneWorseGot = lead(gpa)))
lastname firstname major year gpa OneWorseGot
Lannister Tyrion Communications Sophomore 3.83 3.38
Drogo Khal Zoology Senior 3.38 3.36
Targaryen Daenerys Zoology Freshman 3.36 3.23
Snow John Nordic Studies Junior 3.23 3.23
Clegane Gregor Phys Ed Sophomore 3.23 2.84
Baelish Peter Communications Freshman 2.84 2.78
Stark Eddard History Senior 2.78 2.39
Tarly Samwise Nordic Studies Freshman 2.39 2.24
Bolton Ramsay Phys Ed Freshman 2.24 1.87
Baratheon Joffrey History Freshman 1.87 NA

And can combine them

kable(got %>% mutate(Name = paste(lastname, firstname)) %>%  arrange(desc(gpa)) %>% mutate(OneBetter = lag(gpa), OneWorse = lead(gpa))%>% select(Name, year, major, gpa,  OneBetter, OneWorse))
Name year major gpa OneBetter OneWorse
Lannister Tyrion Sophomore Communications 3.83 NA 3.38
Drogo Khal Senior Zoology 3.38 3.83 3.36
Targaryen Daenerys Freshman Zoology 3.36 3.38 3.23
Snow John Junior Nordic Studies 3.23 3.36 3.23
Clegane Gregor Sophomore Phys Ed 3.23 3.23 2.84
Baelish Peter Freshman Communications 2.84 3.23 2.78
Stark Eddard Senior History 2.78 2.84 2.39
Tarly Samwise Freshman Nordic Studies 2.39 2.78 2.24
Bolton Ramsay Freshman Phys Ed 2.24 2.39 1.87
Baratheon Joffrey Freshman History 1.87 2.24 NA

Create a variable of the major with the worst mean gpa

worstmajor <- got %>% group_by(major) %>% summarise(meangpa = mean(gpa)) %>% filter(min_rank(meangpa)== 1) %>% .$major
worstmajor
## [1] "History"

Show all the information on all of the students in the worst major

worst <- got %>% filter(major == worstmajor)
worst
##    lastname firstname   major     year  gpa
## 1     Stark    Eddard History   Senior 2.78
## 2 Baratheon   Joffrey History Freshman 1.87

Find the top 25%, they make up the dean’s list

deans <- got %>% arrange(desc(gpa)) %>% filter(cume_dist(gpa)>= .75)
deans
##    lastname firstname          major      year  gpa
## 1 Lannister    Tyrion Communications Sophomore 3.83
## 2     Drogo      Khal        Zoology    Senior 3.38
## 3 Targaryen  Daenerys        Zoology  Freshman 3.36

How many gpa points separate the worst student from the best not on the dean’s list?

got %>% arrange(desc(gpa)) %>% mutate(diff = lag(gpa,1)-gpa) %>% filter(cume_dist(gpa)< .75) %>% slice(1) %>% .$diff
## [1] 0.13

How many gpa points separate the best and worst freshmen?

got %>% group_by(year) %>% filter(year == "Freshman") %>% 
summarise(diff = max(gpa)- min(gpa)) 
## # A tibble: 1 × 2
##       year  diff
##      <chr> <dbl>
## 1 Freshman  1.49

Make a list that shows the full_name, major, and year of students not in Phys Ed or Zoology

GList <-got %>% filter(!major %in% c('Phys Ed', 'Zoology')) %>% transmute(full_name = paste(firstname, lastname), major = major, year = year) %>% arrange(full_name)
kable(GList)
full_name major year
Eddard Stark History Senior
Joffrey Baratheon History Freshman
John Snow Nordic Studies Junior
Peter Baelish Communications Freshman
Samwise Tarly Nordic Studies Freshman
Tyrion Lannister Communications Sophomore

Create a tally of students by year

tally <- got %>% group_by(year) %>% summarise(count = n())
pander(tally)
year count
Freshman 5
Junior 1
Senior 2
Sophomore 2

Fotmat the table to wide

library(tidyr)
set.caption("Student Count by Year")
pander(tally %>% spread(year, count))
Student Count by Year
Freshman Junior Senior Sophomore
5 1 2 2

Create a scatter plot with gpa on the x-axis and top_n_percentile on the y.

got %>% arrange(gpa) %>% mutate(top_n_percentile = 1- percent_rank(gpa)) %>% ggvis(~gpa, ~top_n_percentile) %>% layer_points(fill := "red", size := 150)

Practice Questions Unit 6

FlyMe <- read.csv("me_flights1.csv", header = TRUE, stringsAsFactors = FALSE)
kable(head(FlyMe) %>% select(1:5))
FlightDate Carrier TailNum FlightNum Origin
1/5/2016 AA N745VJ 1815 PWM
1/1/2016 AA N748UW 1815 PWM
1/6/2016 AA N829AW 1815 PWM
1/4/2016 AA N755US 1815 PWM
1/7/2016 AA N818AW 1815 PWM
1/8/2016 AA N765US 1815 PWM
kable(names(FlyMe))
FlightDate
Carrier
TailNum
FlightNum
Origin
Dest
DestCityName
DestState
DestStateName
CRSDepTime
DepTime
WheelsOff
WheelsOn
CRSArrTime
ArrTime
Cancelled
Diverted
CRSElapsedTime
ActualElapsedTime
Distance
kable(head(FlyMe))
FlightDate Carrier TailNum FlightNum Origin Dest DestCityName DestState DestStateName CRSDepTime DepTime WheelsOff WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime Distance
1/5/2016 AA N745VJ 1815 PWM CLT Charlotte, NC NC North Carolina 800 906 923 1111 1047 1127 0 0 167 141 813
1/1/2016 AA N748UW 1815 PWM CLT Charlotte, NC NC North Carolina 805 802 814 1034 1050 1040 0 0 165 158 813
1/6/2016 AA N829AW 1815 PWM CLT Charlotte, NC NC North Carolina 800 755 815 1022 1047 1027 0 0 167 152 813
1/4/2016 AA N755US 1815 PWM CLT Charlotte, NC NC North Carolina 805 803 817 1037 1050 1044 0 0 165 161 813
1/7/2016 AA N818AW 1815 PWM CLT Charlotte, NC NC North Carolina 800 758 821 1034 1047 1039 0 0 167 161 813
1/8/2016 AA N765US 1815 PWM CLT Charlotte, NC NC North Carolina 800 801 842 1048 1047 1053 0 0 167 172 813
str(FlyMe)
## 'data.frame':    383 obs. of  20 variables:
##  $ FlightDate       : chr  "1/5/2016" "1/1/2016" "1/6/2016" "1/4/2016" ...
##  $ Carrier          : chr  "AA" "AA" "AA" "AA" ...
##  $ TailNum          : chr  "N745VJ" "N748UW" "N829AW" "N755US" ...
##  $ FlightNum        : int  1815 1815 1815 1815 1815 1815 850 653 1815 1815 ...
##  $ Origin           : chr  "PWM" "PWM" "PWM" "PWM" ...
##  $ Dest             : chr  "CLT" "CLT" "CLT" "CLT" ...
##  $ DestCityName     : chr  "Charlotte, NC" "Charlotte, NC" "Charlotte, NC" "Charlotte, NC" ...
##  $ DestState        : chr  "NC" "NC" "NC" "NC" ...
##  $ DestStateName    : chr  "North Carolina" "North Carolina" "North Carolina" "North Carolina" ...
##  $ CRSDepTime       : int  800 805 800 805 800 800 800 910 800 800 ...
##  $ DepTime          : int  906 802 755 803 758 801 755 903 817 757 ...
##  $ WheelsOff        : int  923 814 815 817 821 842 815 919 828 850 ...
##  $ WheelsOn         : int  1111 1034 1022 1037 1034 1048 1026 1130 1039 1057 ...
##  $ CRSArrTime       : int  1047 1050 1047 1050 1047 1047 1047 1201 1047 1047 ...
##  $ ArrTime          : int  1127 1040 1027 1044 1039 1053 1030 1138 1045 1102 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRSElapsedTime   : int  167 165 167 165 167 167 167 171 167 167 ...
##  $ ActualElapsedTime: int  141 158 152 161 161 172 155 155 148 185 ...
##  $ Distance         : int  813 813 813 813 813 813 813 813 813 813 ...

Convert FlightDate from 01/01/2016 to the format used in R.

FlyMe$FlightDate <- as.Date(FlyMe$FlightDate, format ="%m/%d/%Y")
FlyMe <- FlyMe %>% mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)),
                          
                          new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)),
                          
                          new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)),
                          
                          new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)),
                          
                          new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)),
                          
                          new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)))
FlyMe$new_CRSDepTime <- as.POSIXct(FlyMe$new_CRSDepTime, format = "%Y-%m-%d %H%M")

FlyMe$new_DepTime <- as.POSIXct(FlyMe$new_DepTime, format = "%Y-%m-%d %H%M")

FlyMe$new_CRSArrTime <- as.POSIXct(FlyMe$new_CRSArrTime, format = "%Y-%m-%d %H%M")

FlyMe$new_ArrTime <- as.POSIXct(FlyMe$new_ArrTime, format = "%Y-%m-%d %H%M")

FlyMe$new_WheelsOff <- as.POSIXct(FlyMe$new_WheelsOff, format = "%Y-%m-%d %H%M")

FlyMe$new_WheelsOn <- as.POSIXct(FlyMe$new_WheelsOn, format = "%Y-%m-%d %H%M")
kable(head(FlyMe))
FlightDate Carrier TailNum FlightNum Origin Dest DestCityName DestState DestStateName CRSDepTime DepTime WheelsOff WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime Distance new_CRSDepTime new_DepTime new_CRSArrTime new_ArrTime new_WheelsOff new_WheelsOn
2016-01-05 AA N745VJ 1815 PWM CLT Charlotte, NC NC North Carolina 800 906 923 1111 1047 1127 0 0 167 141 813 2016-01-05 08:00:00 2016-01-05 09:06:00 2016-01-05 10:47:00 2016-01-05 11:27:00 2016-01-05 09:23:00 2016-01-05 11:11:00
2016-01-01 AA N748UW 1815 PWM CLT Charlotte, NC NC North Carolina 805 802 814 1034 1050 1040 0 0 165 158 813 2016-01-01 08:05:00 2016-01-01 08:02:00 2016-01-01 10:50:00 2016-01-01 10:40:00 2016-01-01 08:14:00 2016-01-01 10:34:00
2016-01-06 AA N829AW 1815 PWM CLT Charlotte, NC NC North Carolina 800 755 815 1022 1047 1027 0 0 167 152 813 2016-01-06 08:00:00 2016-01-06 07:55:00 2016-01-06 10:47:00 2016-01-06 10:27:00 2016-01-06 08:15:00 2016-01-06 10:22:00
2016-01-04 AA N755US 1815 PWM CLT Charlotte, NC NC North Carolina 805 803 817 1037 1050 1044 0 0 165 161 813 2016-01-04 08:05:00 2016-01-04 08:03:00 2016-01-04 10:50:00 2016-01-04 10:44:00 2016-01-04 08:17:00 2016-01-04 10:37:00
2016-01-07 AA N818AW 1815 PWM CLT Charlotte, NC NC North Carolina 800 758 821 1034 1047 1039 0 0 167 161 813 2016-01-07 08:00:00 2016-01-07 07:58:00 2016-01-07 10:47:00 2016-01-07 10:39:00 2016-01-07 08:21:00 2016-01-07 10:34:00
2016-01-08 AA N765US 1815 PWM CLT Charlotte, NC NC North Carolina 800 801 842 1048 1047 1053 0 0 167 172 813 2016-01-08 08:00:00 2016-01-08 08:01:00 2016-01-08 10:47:00 2016-01-08 10:53:00 2016-01-08 08:42:00 2016-01-08 10:48:00
str(FlyMe)
## 'data.frame':    383 obs. of  26 variables:
##  $ FlightDate       : Date, format: "2016-01-05" "2016-01-01" ...
##  $ Carrier          : chr  "AA" "AA" "AA" "AA" ...
##  $ TailNum          : chr  "N745VJ" "N748UW" "N829AW" "N755US" ...
##  $ FlightNum        : int  1815 1815 1815 1815 1815 1815 850 653 1815 1815 ...
##  $ Origin           : chr  "PWM" "PWM" "PWM" "PWM" ...
##  $ Dest             : chr  "CLT" "CLT" "CLT" "CLT" ...
##  $ DestCityName     : chr  "Charlotte, NC" "Charlotte, NC" "Charlotte, NC" "Charlotte, NC" ...
##  $ DestState        : chr  "NC" "NC" "NC" "NC" ...
##  $ DestStateName    : chr  "North Carolina" "North Carolina" "North Carolina" "North Carolina" ...
##  $ CRSDepTime       : int  800 805 800 805 800 800 800 910 800 800 ...
##  $ DepTime          : int  906 802 755 803 758 801 755 903 817 757 ...
##  $ WheelsOff        : int  923 814 815 817 821 842 815 919 828 850 ...
##  $ WheelsOn         : int  1111 1034 1022 1037 1034 1048 1026 1130 1039 1057 ...
##  $ CRSArrTime       : int  1047 1050 1047 1050 1047 1047 1047 1201 1047 1047 ...
##  $ ArrTime          : int  1127 1040 1027 1044 1039 1053 1030 1138 1045 1102 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRSElapsedTime   : int  167 165 167 165 167 167 167 171 167 167 ...
##  $ ActualElapsedTime: int  141 158 152 161 161 172 155 155 148 185 ...
##  $ Distance         : int  813 813 813 813 813 813 813 813 813 813 ...
##  $ new_CRSDepTime   : POSIXct, format: "2016-01-05 08:00:00" "2016-01-01 08:05:00" ...
##  $ new_DepTime      : POSIXct, format: "2016-01-05 09:06:00" "2016-01-01 08:02:00" ...
##  $ new_CRSArrTime   : POSIXct, format: "2016-01-05 10:47:00" "2016-01-01 10:50:00" ...
##  $ new_ArrTime      : POSIXct, format: "2016-01-05 11:27:00" "2016-01-01 10:40:00" ...
##  $ new_WheelsOff    : POSIXct, format: "2016-01-05 09:23:00" "2016-01-01 08:14:00" ...
##  $ new_WheelsOn     : POSIXct, format: "2016-01-05 11:11:00" "2016-01-01 10:34:00" ...
FlyMe <- FlyMe %>% 
  mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins")), 
         
         ArrDelay = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")),
         
         TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")), 
         
         TaxiIn = as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")), 
         
         AirDelayMinutes = ifelse(ArrDelay < 0, 0, ArrDelay),
         
         ArrDelay15 = ifelse(ArrDelay >= 15,1,0), 
         
         FlightTimeBuffer = CRSElapsedTime - ActualElapsedTime, AirTime = ActualElapsedTime - TaxiOut - TaxiIn, 
AirSpeed = Distance/(AirTime/60))

kable(head(FlyMe))
FlightDate Carrier TailNum FlightNum Origin Dest DestCityName DestState DestStateName CRSDepTime DepTime WheelsOff WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime ActualElapsedTime Distance new_CRSDepTime new_DepTime new_CRSArrTime new_ArrTime new_WheelsOff new_WheelsOn DepDelay ArrDelay TaxiOut TaxiIn AirDelayMinutes ArrDelay15 FlightTimeBuffer AirTime AirSpeed
2016-01-05 AA N745VJ 1815 PWM CLT Charlotte, NC NC North Carolina 800 906 923 1111 1047 1127 0 0 167 141 813 2016-01-05 08:00:00 2016-01-05 09:06:00 2016-01-05 10:47:00 2016-01-05 11:27:00 2016-01-05 09:23:00 2016-01-05 11:11:00 66 40 17 16 40 1 26 108 451.6667
2016-01-01 AA N748UW 1815 PWM CLT Charlotte, NC NC North Carolina 805 802 814 1034 1050 1040 0 0 165 158 813 2016-01-01 08:05:00 2016-01-01 08:02:00 2016-01-01 10:50:00 2016-01-01 10:40:00 2016-01-01 08:14:00 2016-01-01 10:34:00 -3 -10 12 6 0 0 7 140 348.4286
2016-01-06 AA N829AW 1815 PWM CLT Charlotte, NC NC North Carolina 800 755 815 1022 1047 1027 0 0 167 152 813 2016-01-06 08:00:00 2016-01-06 07:55:00 2016-01-06 10:47:00 2016-01-06 10:27:00 2016-01-06 08:15:00 2016-01-06 10:22:00 -5 -20 20 5 0 0 15 127 384.0945
2016-01-04 AA N755US 1815 PWM CLT Charlotte, NC NC North Carolina 805 803 817 1037 1050 1044 0 0 165 161 813 2016-01-04 08:05:00 2016-01-04 08:03:00 2016-01-04 10:50:00 2016-01-04 10:44:00 2016-01-04 08:17:00 2016-01-04 10:37:00 -2 -6 14 7 0 0 4 140 348.4286
2016-01-07 AA N818AW 1815 PWM CLT Charlotte, NC NC North Carolina 800 758 821 1034 1047 1039 0 0 167 161 813 2016-01-07 08:00:00 2016-01-07 07:58:00 2016-01-07 10:47:00 2016-01-07 10:39:00 2016-01-07 08:21:00 2016-01-07 10:34:00 -2 -8 23 5 0 0 6 133 366.7669
2016-01-08 AA N765US 1815 PWM CLT Charlotte, NC NC North Carolina 800 801 842 1048 1047 1053 0 0 167 172 813 2016-01-08 08:00:00 2016-01-08 08:01:00 2016-01-08 10:47:00 2016-01-08 10:53:00 2016-01-08 08:42:00 2016-01-08 10:48:00 1 6 41 5 6 0 -5 126 387.1429

Is a pilot more likely to fly faster if there is a departure delay?

FlyMe %>% filter(Cancelled == 0) %>% mutate(delayed = DepDelay > 0) %>% group_by(delayed) %>% summarise(avg_AirSpeed = mean(AirSpeed))
## # A tibble: 2 × 2
##   delayed avg_AirSpeed
##     <lgl>        <dbl>
## 1   FALSE     335.0505
## 2    TRUE     350.7744

Is there a day of the week when pilots fly faster?

FlyMe2 <- FlyMe %>% filter(Cancelled == 0)

Using library(lubridate) for days of the week functions

Q3 <- FlyMe2 %>% group_by(day = wday(FlightDate, label = TRUE)) %>% summarise(Av_AirSpeed = mean(AirSpeed))
Q3 %>% ggvis(~day, ~Av_AirSpeed)
## Guessing layer_bars()

Do pilots fly faster on the weekend? Need to use library(chron)

library(chron)
## 
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
## 
##     days, hours, minutes, seconds, years
FlyMe2 %>% group_by((is.weekend(FlightDate))) %>% summarise(Av_AirSpeed = mean(AirSpeed))
## # A tibble: 2 × 2
##   `(is.weekend(FlightDate))` Av_AirSpeed
##                        <lgl>       <dbl>
## 1                      FALSE    337.0506
## 2                       TRUE    343.4683

Does it look like any of the arrivals were actually on the next day and they are incorrectly coded? Look for difference between departure and arrival times.

Q5 <- FlyMe2 %>% mutate(Time = difftime(new_ArrTime, new_DepTime), LongthanDay = ifelse(Time >=24, 1,0)) %>% group_by(LongthanDay) %>% summarise(count = n())
Q5
## # A tibble: 1 × 2
##   LongthanDay count
##         <dbl> <int>
## 1           0   361