Load packages:

library(nycflights13)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(stringr)
library(chron)
## 
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
## 
##     days, hours, minutes, seconds, years

So this dataset contains data flights departed from NYC for 2013

336776 obs. of 19 variables

Things I want to do:

time line graph of dep_timeand sched_dep_time

data1<-nycflights13::flights

str(data1)
## Classes 'tbl_df', 'tbl' and 'data.frame':    336776 obs. of  19 variables:
##  $ year          : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int  517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int  515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num  2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int  830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int  819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num  11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr  "UA" "UA" "AA" "B6" ...
##  $ flight        : int  1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr  "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr  "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr  "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num  227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num  1400 1416 1089 1576 762 ...
##  $ hour          : num  5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num  15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
summary(data1)
##       year          month             day           dep_time   
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400  
##                                                  NA's   :8255  
##  sched_dep_time   dep_delay          arr_time    sched_arr_time
##  Min.   : 106   Min.   : -43.00   Min.   :   1   Min.   :   1  
##  1st Qu.: 906   1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124  
##  Median :1359   Median :  -2.00   Median :1535   Median :1556  
##  Mean   :1344   Mean   :  12.64   Mean   :1502   Mean   :1536  
##  3rd Qu.:1729   3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945  
##  Max.   :2359   Max.   :1301.00   Max.   :2400   Max.   :2359  
##                 NA's   :8255      NA's   :8713                 
##    arr_delay          carrier              flight       tailnum         
##  Min.   : -86.000   Length:336776      Min.   :   1   Length:336776     
##  1st Qu.: -17.000   Class :character   1st Qu.: 553   Class :character  
##  Median :  -5.000   Mode  :character   Median :1496   Mode  :character  
##  Mean   :   6.895                      Mean   :1972                     
##  3rd Qu.:  14.000                      3rd Qu.:3465                     
##  Max.   :1272.000                      Max.   :8500                     
##  NA's   :9430                                                           
##     origin              dest              air_time        distance   
##  Length:336776      Length:336776      Min.   : 20.0   Min.   :  17  
##  Class :character   Class :character   1st Qu.: 82.0   1st Qu.: 502  
##  Mode  :character   Mode  :character   Median :129.0   Median : 872  
##                                        Mean   :150.7   Mean   :1040  
##                                        3rd Qu.:192.0   3rd Qu.:1389  
##                                        Max.   :695.0   Max.   :4983  
##                                        NA's   :9430                  
##       hour           minute        time_hour                  
##  Min.   : 1.00   Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 9.00   1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :13.00   Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :13.18   Mean   :26.23   Mean   :2013-07-03 05:02:36  
##  3rd Qu.:17.00   3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :23.00   Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 
dates.col<- c("dep_time","sched_dep_time","arr_time", "sched_arr_time","air_time")

data1[dates.col]<-sapply(data1[dates.col], function(x) str_pad(x, 4, pad = 0))

# data1[dates.col] <- sapply(data1[dates.col], function(x) as.character(x))

# data1[dates.col] <- sapply(data1[dates.col], function(x) gsub("(\\d\\d)(\\d\\d)", "\\1:\\2", x))

# data1[dates.col]<- sapply(data1[dates.col], function(x) hm(x))

# data1$dep_time<-hm(data1$dep_time)
# data1$sched_dep_time<-hm(data1$sched_dep_time)
# data1$arr_time<-hm(data1$arr_time)
# data1$sched_arr_time<-hm(data1$sched_arr_time)
# 
# data1$dep.delay.1<- data1$dep_time-data1$sched_dep_time
# data1$arr.delay.1<-data1$arr_time-data1$sched_arr_time ###its showing wrong duration ; its not working
# 

All origination flights:

unique(data1$origin)
## [1] "EWR" "LGA" "JFK"

All destination flights:

unique(data1$dest)
##   [1] "IAH" "MIA" "BQN" "ATL" "ORD" "FLL" "IAD" "MCO" "PBI" "TPA" "LAX"
##  [12] "SFO" "DFW" "BOS" "LAS" "MSP" "DTW" "RSW" "SJU" "PHX" "BWI" "CLT"
##  [23] "BUF" "DEN" "SNA" "MSY" "SLC" "XNA" "MKE" "SEA" "ROC" "SYR" "SRQ"
##  [34] "RDU" "CMH" "JAX" "CHS" "MEM" "PIT" "SAN" "DCA" "CLE" "STL" "MYR"
##  [45] "JAC" "MDW" "HNL" "BNA" "AUS" "BTV" "PHL" "STT" "EGE" "AVL" "PWM"
##  [56] "IND" "SAV" "CAK" "HOU" "LGB" "DAY" "ALB" "BDL" "MHT" "MSN" "GSO"
##  [67] "CVG" "BUR" "RIC" "GSP" "GRR" "MCI" "ORF" "SAT" "SDF" "PDX" "SJC"
##  [78] "OMA" "CRW" "OAK" "SMF" "TUL" "TYS" "OKC" "PVD" "DSM" "PSE" "BHM"
##  [89] "CAE" "HDN" "BZN" "MTJ" "EYW" "PSP" "ACK" "BGR" "ABQ" "ILM" "MVY"
## [100] "SBN" "LEX" "CHO" "TVC" "ANC" "LGA"
length(unique(data1$dest))
## [1] 105

All carrier who flew out from NYK:

unique(data1$carrier)
##  [1] "UA" "AA" "B6" "DL" "EV" "MQ" "US" "WN" "VX" "FL" "AS" "9E" "F9" "HA"
## [15] "YV" "OO"
length(unique(data1$carrier))
## [1] 16
# summary(data1$dep_delay)


data1$origin<-as.factor(data1$origin)
data1$dest<-as.factor(data1$dest)
data1$carrier<-as.factor(data1$carrier)

mean.dep.delay<-data1 %>% select(carrier, origin, dest, dep_delay) %>% group_by(carrier) %>% summarise(mean.delay=mean(dep_delay, na.rm=T))

data1<- data1 %>% group_by(carrier) %>% mutate(mean.delay=mean(dep_delay, na.rm=T))



data1$carrier<-factor(data1$carrier, levels=data1$carrier[order(data1$mean.delay)])
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
ggplot(data1, aes(carrier))+stat_summary_bin(aes(y=dep_delay), fun.y="mean", geom = "bar", na.rm = T)+coord_flip()
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

# data1 %>% filter(carrier=="F9" & dep_delay>0)

Average Departure Dealy by Carrier

SO there are 16 Carrier that fly out from NYC. Out of which US airlines has the least average deplay time of less than 5 minutes wheras carrier F9 has the biggest average departure flight delay of 20 minutes for year 2013.

ggplot(data1, aes(carrier,dep_delay))+geom_boxplot()+coord_flip()
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning: Removed 8255 rows containing non-finite values (stat_boxplot).

We can clearly seee from the above box plot for carrier vs their departure delay time that US airline has less variability compared to other carriers. Another reason why US has better average departure delay than other is because they leave early then other too.

ggplot(data1, aes(carrier,dep_delay))+geom_boxplot()+coord_cartesian(ylim = c(-45,50))
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning: Removed 8255 rows containing non-finite values (stat_boxplot).

data1%>% filter(carrier=="US") %>% select(dep_delay) %>% summarise(median(dep_delay,na.rm=T))
## Adding missing grouping variables: `carrier`
## # A tibble: 1 × 2
##   carrier `median(dep_delay, na.rm = T)`
##    <fctr>                          <dbl>
## 1      US                             -4

When zoomed-in the boxplot clearly shows that the average figures are skewed due to flihts leaving to their destination earlier than required. FOr US carrier the median is arounf -4 minutes whereas mean is around 4 minutes. This is the not the true refelction of delay departure. I think we need to remove periods where carrier were leaving earlier than required.

rm.mean.dep.delay<- data1 %>% filter(dep_delay>=0) %>% group_by(carrier) %>% summarise(mean=mean(dep_delay, na.rm=T))

data1 %>% filter(dep_delay>=0) %>% ggplot(aes(carrier,dep_delay))+geom_boxplot()
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

data1 %>% filter(dep_delay>=0) %>% ggplot(aes(carrier))+stat_summary_bin(aes(y=dep_delay), fun.y = "mean", geom="bar", na.rm=T)
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

rm.mean.dep.delay$carrier <- factor(rm.mean.dep.delay$carrier, levels = rm.mean.dep.delay$carrier[order(rm.mean.dep.delay$mean)])

ggplot(rm.mean.dep.delay, aes(carrier,mean))+geom_bar(stat="identity")+coord_flip()

Now after removing -ve delay departures we see that best dep dealy on average is UA. US comes at the 4th rank. Whereas OO is the worst.

data1%>% filter(dep_delay>=0) %>% ggplot(aes(dep_delay))+geom_histogram(binwidth=15, na.rm = T)+xlim(0,500)+ylim(0,35000)

data1$date<-as.Date(data1$time_hour)



data1 %>% filter(!is.na(dep_delay)) %>% group_by(date, flight) %>% summarise(median=median(dep_delay))%>% ggplot(aes(date,median))+geom_point(aes(alpha=.1))

data1 %>% filter(!is.na(dep_delay)) %>% group_by(date) %>% summarise(mean=mean(dep_delay))%>% ggplot(aes(date,mean))+geom_point(aes(alpha=.1))

data1 %>% filter(!is.na(dep_delay)) %>% group_by(date, flight) %>% summarise(mean=mean(dep_delay)) %>% ggplot(aes(date,mean))+geom_point(aes(alpha=.1))+
        labs(title="Departure Dealy - average delay minutes vs dates", x="Dates")

arr.delay<-data1 %>% filter(!is.na(arr_delay)) %>% group_by(date) %>% summarise(mean=mean(arr_delay), counts=n()) %>% arrange(desc(date))
ggplot(arr.delay, aes(date,mean))+geom_point()+
        labs(title="Departure Dealy - median minutes vs dates", x="Dates")+scale_size_area()

# ggplot(arr.delay, aes(date, mean, col=dest))+geom_point()+facet_wrap(~carrier)

vlookup

airports<-nycflights13::airlines

str(airports)
## Classes 'tbl_df', 'tbl' and 'data.frame':    16 obs. of  2 variables:
##  $ carrier: chr  "9E" "AA" "AS" "B6" ...
##  $ name   : chr  "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc." "JetBlue Airways" ...
data1$carrier<-as.character(data1$carrier)

data1<-left_join(data1, airports, by="carrier")

data1 <- data1 %>% rename(Airlines=name)
planes<-nycflights13::planes

min and max as per carrier

min.max.delay<-data1 %>% group_by(carrier) %>% summarise_each(funs(min(., na.rm=T), max(., na.rm=T)), dep_delay, arr_delay) %>% 
        arrange(dep_delay_max)

head(min.max.delay)
## # A tibble: 6 × 5
##   carrier dep_delay_min arr_delay_min dep_delay_max arr_delay_max
##     <chr>         <dbl>         <dbl>         <dbl>         <dbl>
## 1      OO           -14           -26           154           157
## 2      AS           -21           -74           225           198
## 3      YV           -16           -46           387           381
## 4      WN           -13           -58           471           453
## 5      UA           -20           -75           483           455
## 6      US           -19           -70           500           492

Flight count by month

data1 %>% group_by(month) %>% tally()
## # A tibble: 12 × 2
##    month     n
##    <int> <int>
## 1      1 27004
## 2      2 24951
## 3      3 28834
## 4      4 28330
## 5      5 28796
## 6      6 28243
## 7      7 29425
## 8      8 29327
## 9      9 27574
## 10    10 28889
## 11    11 27268
## 12    12 28135
data1 %>% group_by(month) %>% tally() %>% mutate(change=n-lag(n))
## # A tibble: 12 × 3
##    month     n change
##    <int> <int>  <int>
## 1      1 27004     NA
## 2      2 24951  -2053
## 3      3 28834   3883
## 4      4 28330   -504
## 5      5 28796    466
## 6      6 28243   -553
## 7      7 29425   1182
## 8      8 29327    -98
## 9      9 27574  -1753
## 10    10 28889   1315
## 11    11 27268  -1621
## 12    12 28135    867
weather<-nycflights13::weather

weather$date<-as.Date(weather$time_hour)

dep.delay.weather<-data1 %>% group_by(date,hour) %>% filter(!is.na(dep_delay)) %>% summarise(delay.mean=mean(dep_delay), n=n()) %>% filter(n>15) %>%
  left_join(weather)
## Joining, by = c("date", "hour")
ggplot(dep.delay.weather, aes(delay.mean,visib))+geom_jitter()
## Warning: Removed 18 rows containing missing values (geom_point).

ggplot(dep.delay.weather, aes(wind_gust,delay.mean,))+geom_jitter()+geom_smooth()+coord_cartesian(xlim = c(0,55), ylim = c(-5,200))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 21 rows containing non-finite values (stat_smooth).
## Warning: Removed 21 rows containing missing values (geom_point).

ggplot(dep.delay.weather, aes(pressure,delay.mean,))+geom_jitter()+geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1883 rows containing non-finite values (stat_smooth).
## Warning: Removed 1883 rows containing missing values (geom_point).

qplot(is.na(wind_gust), delay.mean, data = dep.delay.weather, geom = "boxplot")