library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.2.5
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.2.5
## Warning: package 'tibble' was built under R version 3.2.5
## Warning: package 'tidyr' was built under R version 3.2.5
## Warning: package 'readr' was built under R version 3.2.5
## Warning: package 'purrr' was built under R version 3.2.5
## Warning: package 'dplyr' was built under R version 3.2.5
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
*Dataset
# install hflights package as install.packages("hflights")
library(hflights)
## Warning: package 'hflights' was built under R version 3.2.5
# Check few first rows of dataset
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
# This dataset is in tuple format, convert to data frame : as.data.frame(hflights)
#Check number of rows and columns
dim(hflights)
## [1] 227496 21
#Check variables names
names(hflights)
## [1] "Year" "Month" "DayofMonth"
## [4] "DayOfWeek" "DepTime" "ArrTime"
## [7] "UniqueCarrier" "FlightNum" "TailNum"
## [10] "ActualElapsedTime" "AirTime" "ArrDelay"
## [13] "DepDelay" "Origin" "Dest"
## [16] "Distance" "TaxiIn" "TaxiOut"
## [19] "Cancelled" "CancellationCode" "Diverted"
# Check the structure of the dataset
glimpse(hflights)
## Observations: 227,496
## Variables: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# or
str(hflights)
## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
# rename confusing code "UniqueCarrier" column
lut<-c("AA"="American","AS"="Alaska","B6"="JetBlue","CO"="Continental","DL"="Delta","OO"="SkyWest","AU"="United","US"="US_Airways","WN"="Southwest","EV"="Atlantic_Southeast","F9"="Frontier","FL"="AirTran","MQ"="American_Eagle","XE"="ExpressJet","YV"="Mesa")
# Create a new column
hflights$New_Carrier<-lut[hflights$UniqueCarrier]
# Encode cancellationCode
unique(hflights$CancellationCode)
## [1] "" "A" "B" "C" "D"
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
## New_Carrier
## 5424 American
## 5425 American
## 5426 American
## 5427 American
## 5428 American
## 5429 American
# Changing labels of CancellationCode
hflights$CancellationCode<-as.factor(hflights$CancellationCode)
my_flight<-hflights
# Create new column with new name labels
levels(my_flight$CancellationCode)<-c(NA,"Carrier","Weather","National Air System","Security")
tail(my_flight)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 6083254 2011 12 6 2 1307 1600 WN
## 6083255 2011 12 6 2 1818 2111 WN
## 6083256 2011 12 6 2 2047 2334 WN
## 6083257 2011 12 6 2 912 1031 WN
## 6083258 2011 12 6 2 656 812 WN
## 6083259 2011 12 6 2 1600 1713 WN
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay
## 6083254 471 N632SW 113 98 0 7
## 6083255 1191 N284WN 113 97 -9 8
## 6083256 1674 N366SW 107 94 4 7
## 6083257 127 N777QC 79 61 -4 -3
## 6083258 621 N727SW 76 64 -13 -4
## 6083259 1597 N745SW 73 59 -12 0
## Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode
## 6083254 HOU TPA 781 5 10 0 <NA>
## 6083255 HOU TPA 781 5 11 0 <NA>
## 6083256 HOU TPA 781 4 9 0 <NA>
## 6083257 HOU TUL 453 4 14 0 <NA>
## 6083258 HOU TUL 453 3 9 0 <NA>
## 6083259 HOU TUL 453 3 11 0 <NA>
## Diverted New_Carrier
## 6083254 0 Southwest
## 6083255 0 Southwest
## 6083256 0 Southwest
## 6083257 0 Southwest
## 6083258 0 Southwest
## 6083259 0 Southwest
*Select
# Select only four variables below
my_new_variable<-hflights%>%dplyr::select(UniqueCarrier,AirTime,ArrTime,DepDelay)
head(my_new_variable)
## UniqueCarrier AirTime ArrTime DepDelay
## 5424 AA 40 1500 0
## 5425 AA 45 1501 1
## 5426 AA 48 1502 -8
## 5427 AA 39 1513 3
## 5428 AA 44 1507 5
## 5429 AA 45 1503 -1
# Select all variables, except variable 5 to9
my_variable1<-hflights %>% dplyr::select(-c(5:9))
head(my_variable1)
## Year Month DayofMonth DayOfWeek ActualElapsedTime AirTime ArrDelay
## 5424 2011 1 1 6 60 40 -10
## 5425 2011 1 2 7 60 45 -9
## 5426 2011 1 3 1 70 48 -8
## 5427 2011 1 4 2 70 39 3
## 5428 2011 1 5 3 62 44 -3
## 5429 2011 1 6 4 64 45 -7
## DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled
## 5424 0 IAH DFW 224 7 13 0
## 5425 1 IAH DFW 224 6 9 0
## 5426 -8 IAH DFW 224 5 17 0
## 5427 3 IAH DFW 224 9 22 0
## 5428 5 IAH DFW 224 9 9 0
## 5429 -1 IAH DFW 224 6 13 0
## CancellationCode Diverted New_Carrier
## 5424 0 American
## 5425 0 American
## 5426 0 American
## 5427 0 American
## 5428 0 American
## 5429 0 American
# Select the first four variables
my_first_fourvariable<-hflights %>% dplyr::select(Year:DayOfWeek)
head(my_first_fourvariable)
## Year Month DayofMonth DayOfWeek
## 5424 2011 1 1 6
## 5425 2011 1 2 7
## 5426 2011 1 3 1
## 5427 2011 1 4 2
## 5428 2011 1 5 3
## 5429 2011 1 6 4
# select all columns end with "Time"
my_time_variable<-hflights %>% dplyr::select(ends_with("Time"))
head(my_time_variable)
## DepTime ArrTime ActualElapsedTime AirTime
## 5424 1400 1500 60 40
## 5425 1401 1501 60 45
## 5426 1352 1502 70 48
## 5427 1403 1513 70 39
## 5428 1405 1507 62 44
## 5429 1359 1503 64 45
# Can be 'starts_with("T")', 'contains("Time")'
my_combine<-hflights %>% dplyr::select(contains("Taxi"),Distance)
head(my_combine)
## TaxiIn TaxiOut Distance
## 5424 7 13 224
## 5425 6 9 224
## 5426 5 17 224
## 5427 9 22 224
## 5428 9 9 224
## 5429 6 13 224
*Mutate
# Create a avarage speed
my_speed<-hflights %>% dplyr::mutate(mean_speed=(Distance/AirTime)*60)
# can be creating many new variables at the same time
head(my_speed)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N576AA 60 40 -10 0 IAH DFW 224
## 2 N557AA 60 45 -9 1 IAH DFW 224
## 3 N541AA 70 48 -8 -8 IAH DFW 224
## 4 N403AA 70 39 3 3 IAH DFW 224
## 5 N492AA 62 44 -3 5 IAH DFW 224
## 6 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted New_Carrier
## 1 7 13 0 0 American
## 2 6 9 0 0 American
## 3 5 17 0 0 American
## 4 9 22 0 0 American
## 5 9 9 0 0 American
## 6 6 13 0 0 American
## mean_speed
## 1 336.0000
## 2 298.6667
## 3 280.0000
## 4 344.6154
## 5 305.4545
## 6 298.6667
*To be continued
Filter
# find all flights operated by American, Jetblue
new_flights<- hflights %>% filter(New_Carrier %in% c("American","Delta","JetBlue"))
tail(new_flights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 6575 2011 12 30 5 1554 1849 DL
## 6576 2011 12 30 5 557 856 DL
## 6577 2011 12 31 6 1528 1812 DL
## 6578 2011 12 31 6 709 1005 DL
## 6579 2011 12 31 6 1423 1719 DL
## 6580 2011 12 31 6 556 853 DL
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 6575 1846 N909DL 115 92 -10 -5 HOU
## 6576 2010 N968DL 119 96 -9 -3 IAH
## 6577 8 N376NW 104 90 -21 -4 IAH
## 6578 810 N337NW 116 94 -5 2 IAH
## 6579 1590 N914DL 116 92 278 283 IAH
## 6580 2010 N987DL 117 93 -12 -4 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 6575 ATL 696 9 14 0 0
## 6576 ATL 689 10 13 0 0
## 6577 ATL 689 4 10 0 0
## 6578 ATL 689 11 11 0 0
## 6579 ATL 689 7 17 0 0
## 6580 ATL 689 8 16 0 0
## New_Carrier
## 6575 Delta
## 6576 Delta
## 6577 Delta
## 6578 Delta
## 6579 Delta
## 6580 Delta
# Create a date column from Year,Month and date columns
new_date<- hflights %>% dplyr::mutate(Date=paste(DayofMonth,Month,Year,sep = "-")) %>% dplyr::select(20:23)
head(new_date)
## CancellationCode Diverted New_Carrier Date
## 1 0 American 1-1-2011
## 2 0 American 2-1-2011
## 3 0 American 3-1-2011
## 4 0 American 4-1-2011
## 5 0 American 5-1-2011
## 6 0 American 6-1-2011
# How many airplanes only flew to one destination
only_dest<- hflights %>% group_by(TailNum) %>% summarise(ndest=n_distinct(Dest)) %>% filter(ndest==1) %>% summarise(n_airplane=n())
dim(only_dest)
## [1] 1 1
# Find the most visited destinations for each carrier
most_dest<- hflights%>% group_by(UniqueCarrier,Dest) %>% summarise(n=n()) %>% mutate(rank=rank(desc(n))) %>% filter(rank==1)
dim(most_dest)
## [1] 15 4
head(most_dest)
## Source: local data frame [6 x 4]
## Groups: UniqueCarrier [6]
##
## UniqueCarrier Dest n rank
## <chr> <chr> <int> <dbl>
## 1 AA DFW 2105 1
## 2 AS SEA 365 1
## 3 B6 JFK 695 1
## 4 CO EWR 3924 1
## 5 DL ATL 2396 1
## 6 EV DTW 851 1