Load packages:
library(nycflights13)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(stringr)
library(chron)
##
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
##
## days, hours, minutes, seconds, years
So this dataset contains data flights departed from NYC for 2013
336776 obs. of 19 variables
time line graph of dep_timeand sched_dep_time
data1<-nycflights13::flights
str(data1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 336776 obs. of 19 variables:
## $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr "UA" "UA" "AA" "B6" ...
## $ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num 1400 1416 1089 1576 762 ...
## $ hour : num 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
summary(data1)
## year month day dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907
## Median :2013 Median : 7.000 Median :16.00 Median :1401
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400
## NA's :8255
## sched_dep_time dep_delay arr_time sched_arr_time
## Min. : 106 Min. : -43.00 Min. : 1 Min. : 1
## 1st Qu.: 906 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124
## Median :1359 Median : -2.00 Median :1535 Median :1556
## Mean :1344 Mean : 12.64 Mean :1502 Mean :1536
## 3rd Qu.:1729 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945
## Max. :2359 Max. :1301.00 Max. :2400 Max. :2359
## NA's :8255 NA's :8713
## arr_delay carrier flight tailnum
## Min. : -86.000 Length:336776 Min. : 1 Length:336776
## 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character
## Median : -5.000 Mode :character Median :1496 Mode :character
## Mean : 6.895 Mean :1972
## 3rd Qu.: 14.000 3rd Qu.:3465
## Max. :1272.000 Max. :8500
## NA's :9430
## origin dest air_time distance
## Length:336776 Length:336776 Min. : 20.0 Min. : 17
## Class :character Class :character 1st Qu.: 82.0 1st Qu.: 502
## Mode :character Mode :character Median :129.0 Median : 872
## Mean :150.7 Mean :1040
## 3rd Qu.:192.0 3rd Qu.:1389
## Max. :695.0 Max. :4983
## NA's :9430
## hour minute time_hour
## Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :13.00 Median :29.00 Median :2013-07-03 10:00:00
## Mean :13.18 Mean :26.23 Mean :2013-07-03 05:02:36
## 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00
##
dates.col<- c("dep_time","sched_dep_time","arr_time", "sched_arr_time","air_time")
data1[dates.col]<-sapply(data1[dates.col], function(x) str_pad(x, 4, pad = 0))
# data1[dates.col] <- sapply(data1[dates.col], function(x) as.character(x))
# data1[dates.col] <- sapply(data1[dates.col], function(x) gsub("(\\d\\d)(\\d\\d)", "\\1:\\2", x))
# data1[dates.col]<- sapply(data1[dates.col], function(x) hm(x))
# data1$dep_time<-hm(data1$dep_time)
# data1$sched_dep_time<-hm(data1$sched_dep_time)
# data1$arr_time<-hm(data1$arr_time)
# data1$sched_arr_time<-hm(data1$sched_arr_time)
#
# data1$dep.delay.1<- data1$dep_time-data1$sched_dep_time
# data1$arr.delay.1<-data1$arr_time-data1$sched_arr_time ###its showing wrong duration ; its not working
#
All origination flights:
unique(data1$origin)
## [1] "EWR" "LGA" "JFK"
All destination flights:
unique(data1$dest)
## [1] "IAH" "MIA" "BQN" "ATL" "ORD" "FLL" "IAD" "MCO" "PBI" "TPA" "LAX"
## [12] "SFO" "DFW" "BOS" "LAS" "MSP" "DTW" "RSW" "SJU" "PHX" "BWI" "CLT"
## [23] "BUF" "DEN" "SNA" "MSY" "SLC" "XNA" "MKE" "SEA" "ROC" "SYR" "SRQ"
## [34] "RDU" "CMH" "JAX" "CHS" "MEM" "PIT" "SAN" "DCA" "CLE" "STL" "MYR"
## [45] "JAC" "MDW" "HNL" "BNA" "AUS" "BTV" "PHL" "STT" "EGE" "AVL" "PWM"
## [56] "IND" "SAV" "CAK" "HOU" "LGB" "DAY" "ALB" "BDL" "MHT" "MSN" "GSO"
## [67] "CVG" "BUR" "RIC" "GSP" "GRR" "MCI" "ORF" "SAT" "SDF" "PDX" "SJC"
## [78] "OMA" "CRW" "OAK" "SMF" "TUL" "TYS" "OKC" "PVD" "DSM" "PSE" "BHM"
## [89] "CAE" "HDN" "BZN" "MTJ" "EYW" "PSP" "ACK" "BGR" "ABQ" "ILM" "MVY"
## [100] "SBN" "LEX" "CHO" "TVC" "ANC" "LGA"
length(unique(data1$dest))
## [1] 105
All carrier who flew out from NYK:
unique(data1$carrier)
## [1] "UA" "AA" "B6" "DL" "EV" "MQ" "US" "WN" "VX" "FL" "AS" "9E" "F9" "HA"
## [15] "YV" "OO"
length(unique(data1$carrier))
## [1] 16
# summary(data1$dep_delay)
data1$origin<-as.factor(data1$origin)
data1$dest<-as.factor(data1$dest)
data1$carrier<-as.factor(data1$carrier)
mean.dep.delay<-data1 %>% select(carrier, origin, dest, dep_delay) %>% group_by(carrier) %>% summarise(mean.delay=mean(dep_delay, na.rm=T))
data1<- data1 %>% group_by(carrier) %>% mutate(mean.delay=mean(dep_delay, na.rm=T))
data1$carrier<-factor(data1$carrier, levels=data1$carrier[order(data1$mean.delay)])
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
ggplot(data1, aes(carrier))+stat_summary_bin(aes(y=dep_delay), fun.y="mean", geom = "bar", na.rm = T)+coord_flip()
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
# data1 %>% filter(carrier=="F9" & dep_delay>0)
SO there are 16 Carrier that fly out from NYC. Out of which US airlines has the least average deplay time of less than 5 minutes wheras carrier F9 has the biggest average departure flight delay of 20 minutes for year 2013.
ggplot(data1, aes(carrier,dep_delay))+geom_boxplot()+coord_flip()
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning: Removed 8255 rows containing non-finite values (stat_boxplot).
We can clearly seee from the above box plot for carrier vs their departure delay time that US airline has less variability compared to other carriers. Another reason why US has better average departure delay than other is because they leave early then other too.
ggplot(data1, aes(carrier,dep_delay))+geom_boxplot()+coord_cartesian(ylim = c(-45,50))
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning: Removed 8255 rows containing non-finite values (stat_boxplot).
data1%>% filter(carrier=="US") %>% select(dep_delay) %>% summarise(median(dep_delay,na.rm=T))
## Adding missing grouping variables: `carrier`
## # A tibble: 1 × 2
## carrier `median(dep_delay, na.rm = T)`
## <fctr> <dbl>
## 1 US -4
When zoomed-in the boxplot clearly shows that the average figures are skewed due to flihts leaving to their destination earlier than required. FOr US carrier the median is arounf -4 minutes whereas mean is around 4 minutes. This is the not the true refelction of delay departure. I think we need to remove periods where carrier were leaving earlier than required.
rm.mean.dep.delay<- data1 %>% filter(dep_delay>=0) %>% group_by(carrier) %>% summarise(mean=mean(dep_delay, na.rm=T))
data1 %>% filter(dep_delay>=0) %>% ggplot(aes(carrier,dep_delay))+geom_boxplot()
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
data1 %>% filter(dep_delay>=0) %>% ggplot(aes(carrier))+stat_summary_bin(aes(y=dep_delay), fun.y = "mean", geom="bar", na.rm=T)
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
rm.mean.dep.delay$carrier <- factor(rm.mean.dep.delay$carrier, levels = rm.mean.dep.delay$carrier[order(rm.mean.dep.delay$mean)])
ggplot(rm.mean.dep.delay, aes(carrier,mean))+geom_bar(stat="identity")+coord_flip()
Now after removing -ve delay departures we see that best dep dealy on average is UA. US comes at the 4th rank. Whereas OO is the worst.
data1%>% filter(dep_delay>=0) %>% ggplot(aes(dep_delay))+geom_histogram(binwidth=15, na.rm = T)+xlim(0,500)+ylim(0,35000)
data1$date<-as.Date(data1$time_hour)
data1 %>% filter(!is.na(dep_delay)) %>% group_by(date, flight) %>% summarise(median=median(dep_delay))%>% ggplot(aes(date,median))+geom_point(aes(alpha=.1))
data1 %>% filter(!is.na(dep_delay)) %>% group_by(date) %>% summarise(mean=mean(dep_delay))%>% ggplot(aes(date,mean))+geom_point(aes(alpha=.1))
data1 %>% filter(!is.na(dep_delay)) %>% group_by(date, flight) %>% summarise(mean=mean(dep_delay)) %>% ggplot(aes(date,mean))+geom_point(aes(alpha=.1))+
labs(title="Departure Dealy - average delay minutes vs dates", x="Dates")
arr.delay<-data1 %>% filter(!is.na(arr_delay)) %>% group_by(date) %>% summarise(mean=mean(arr_delay), counts=n()) %>% arrange(desc(date))
ggplot(arr.delay, aes(date,mean))+geom_point()+
labs(title="Departure Dealy - median minutes vs dates", x="Dates")+scale_size_area()
# ggplot(arr.delay, aes(date, mean, col=dest))+geom_point()+facet_wrap(~carrier)
airports<-nycflights13::airlines
str(airports)
## Classes 'tbl_df', 'tbl' and 'data.frame': 16 obs. of 2 variables:
## $ carrier: chr "9E" "AA" "AS" "B6" ...
## $ name : chr "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc." "JetBlue Airways" ...
data1$carrier<-as.character(data1$carrier)
data1<-left_join(data1, airports, by="carrier")
data1 <- data1 %>% rename(Airlines=name)
planes<-nycflights13::planes
min.max.delay<-data1 %>% group_by(carrier) %>% summarise_each(funs(min(., na.rm=T), max(., na.rm=T)), dep_delay, arr_delay) %>%
arrange(dep_delay_max)
head(min.max.delay)
## # A tibble: 6 × 5
## carrier dep_delay_min arr_delay_min dep_delay_max arr_delay_max
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 OO -14 -26 154 157
## 2 AS -21 -74 225 198
## 3 YV -16 -46 387 381
## 4 WN -13 -58 471 453
## 5 UA -20 -75 483 455
## 6 US -19 -70 500 492
data1 %>% group_by(month) %>% tally()
## # A tibble: 12 × 2
## month n
## <int> <int>
## 1 1 27004
## 2 2 24951
## 3 3 28834
## 4 4 28330
## 5 5 28796
## 6 6 28243
## 7 7 29425
## 8 8 29327
## 9 9 27574
## 10 10 28889
## 11 11 27268
## 12 12 28135
data1 %>% group_by(month) %>% tally() %>% mutate(change=n-lag(n))
## # A tibble: 12 × 3
## month n change
## <int> <int> <int>
## 1 1 27004 NA
## 2 2 24951 -2053
## 3 3 28834 3883
## 4 4 28330 -504
## 5 5 28796 466
## 6 6 28243 -553
## 7 7 29425 1182
## 8 8 29327 -98
## 9 9 27574 -1753
## 10 10 28889 1315
## 11 11 27268 -1621
## 12 12 28135 867
weather<-nycflights13::weather
weather$date<-as.Date(weather$time_hour)
dep.delay.weather<-data1 %>% group_by(date,hour) %>% filter(!is.na(dep_delay)) %>% summarise(delay.mean=mean(dep_delay), n=n()) %>% filter(n>15) %>%
left_join(weather)
## Joining, by = c("date", "hour")
ggplot(dep.delay.weather, aes(delay.mean,visib))+geom_jitter()
## Warning: Removed 18 rows containing missing values (geom_point).
ggplot(dep.delay.weather, aes(wind_gust,delay.mean,))+geom_jitter()+geom_smooth()+coord_cartesian(xlim = c(0,55), ylim = c(-5,200))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 21 rows containing non-finite values (stat_smooth).
## Warning: Removed 21 rows containing missing values (geom_point).
ggplot(dep.delay.weather, aes(pressure,delay.mean,))+geom_jitter()+geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1883 rows containing non-finite values (stat_smooth).
## Warning: Removed 1883 rows containing missing values (geom_point).
qplot(is.na(wind_gust), delay.mean, data = dep.delay.weather, geom = "boxplot")