The purpose of this homework is to review and summarize data on NYC flights in 2013. I will approach this review from the point of view of a principal investigator preparing a request for grant funding to do research.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## 다음의 패키지를 부착합니다: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(dplyr)
# Browse the raw flights table in the data viewer.
view(flights)
# Descriptive statistics for every column via psych::describe(); rows marked
# with * in the output are the non-numeric (character) columns.
flights %>%
  psych::describe()
## Warning in FUN(newX[, i], ...): min에 전달되는 인자들 중 누락이 있어 Inf를 반환
## 합니다
## Warning in FUN(newX[, i], ...): max에 전달되는 인자들 중 누락이 있어 -Inf를 반환
## 합니다
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
# Check for missing values (NA): count them once per column, then keep the
# columns that contain at least one.
na_counts <- colSums(is.na(flights))
na.cols <- which(na_counts > 0)
sort(na_counts[na.cols], decreasing = TRUE)
## arr_delay air_time arr_time dep_time dep_delay tailnum
## 9430 9430 8713 8255 8255 2512
# These columns are only partly missing, so report them as "missing values"
# rather than "no values".
paste("Number of columns with missing values:", length(na.cols))
## [1] "Number of columns with missing values: 6"
# How should the NA rows be handled, and are they needed for the analysis?
# Keep only flights that have both a distance and an air time recorded.
flights_nona <- flights %>%
  filter(!is.na(distance), !is.na(air_time))
# List the row/column position of any NA still present in the filtered data.
which(is.na(flights_nona), arr.ind = TRUE)
## row col
# Inspect the column types and dimensions of the filtered data.
flights_nona %>%
  str()
## tibble [327,346 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:327346] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:327346] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:327346] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:327346] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:327346] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:327346] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:327346] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:327346] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:327346] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:327346] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:327346] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:327346] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:327346] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:327346] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:327346] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:327346] 1400 1416 1089 1576 762 ...
## $ hour : num [1:327346] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:327346] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:327346], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Five-number summaries (plus mean) for every column of the filtered data.
flights_nona %>%
  summary()
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 905
## Median :2013 Median : 7.000 Median :16.00 Median :1400 Median :1355
## Mean :2013 Mean : 6.565 Mean :15.74 Mean :1349 Mean :1340
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1122 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1554 Median : -5.000
## Mean : 12.56 Mean :1502 Mean :1533 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1944 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## carrier flight tailnum origin
## Length:327346 Min. : 1 Length:327346 Length:327346
## Class :character 1st Qu.: 544 Class :character Class :character
## Mode :character Median :1467 Mode :character Mode :character
## Mean :1943
## 3rd Qu.:3412
## Max. :8500
## dest air_time distance hour
## Length:327346 Min. : 20.0 Min. : 80 Min. : 5.00
## Class :character 1st Qu.: 82.0 1st Qu.: 509 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 888 Median :13.00
## Mean :150.7 Mean :1048 Mean :13.14
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-05 06:00:00.00
## Median :29.00 Median :2013-07-04 09:00:00.00
## Mean :26.23 Mean :2013-07-03 17:56:45.44
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 18:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00
Your assignment is to create one plot to visualize one aspect of this dataset. The plot may be any type we have covered so far in this class (bargraphs, scatterplots, boxplots, histograms, treemaps, heatmaps, streamgraphs, or alluvials)
# Scatterplot of air time against distance, one point per flight, coloured by
# carrier. The colour mapping is placed on the point layer only: a plot-level
# colour mapping would also group the smoother by carrier, drawing one
# dark-red regression line per carrier instead of a single overall trend.
ggplot(flights_nona, aes(x = distance, y = air_time)) +
  geom_point(aes(color = carrier), size = 1.2) +
  # Single linear fit across all flights.
  geom_smooth(method = "lm", color = "darkred") +
  labs(
    title = "Distance vs Air Time for Flights in 2013 by Carrier",
    # Units as documented for nycflights13::flights.
    x = "Distance (miles)",
    y = "Air Time (minutes)",
    color = "Carrier \n Code"
  )
## `geom_smooth()` using formula 'y ~ x'
# Isolate the longest flights (distance above 4500) and summarise them.
flights_large <- filter(flights_nona, distance > 4500)
summary(flights_large)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 641 Min. : 900
## 1st Qu.:2013 1st Qu.: 3.000 1st Qu.: 8.00 1st Qu.: 952 1st Qu.:1000
## Median :2013 Median : 6.000 Median :16.00 Median :1001 Median :1000
## Mean :2013 Mean : 6.408 Mean :15.74 Mean :1123 Mean :1124
## 3rd Qu.:2013 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:1331 3rd Qu.:1329
## Max. :2013 Max. :12.000 Max. :31.00 Max. :1755 Max. :1344
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -16.000 Min. : 34 Min. :1430 Min. : -70.000
## 1st Qu.: -5.000 1st Qu.:1448 1st Qu.:1510 1st Qu.: -22.000
## Median : -1.000 Median :1546 Median :1540 Median : -7.000
## Mean : 9.315 Mean :1633 Mean :1644 Mean : -1.365
## 3rd Qu.: 5.000 3rd Qu.:1810 3rd Qu.:1813 3rd Qu.: 6.000
## Max. :1301.000 Max. :2213 Max. :1944 Max. :1272.000
## carrier flight tailnum origin
## Length:701 Min. :15.00 Length:701 Length:701
## Class :character 1st Qu.:15.00 Class :character Class :character
## Mode :character Median :15.00 Mode :character Mode :character
## Mean :32.56
## 3rd Qu.:51.00
## Max. :51.00
## dest air_time distance hour
## Length:701 Min. :562.0 Min. :4963 Min. : 9.00
## Class :character 1st Qu.:602.0 1st Qu.:4963 1st Qu.:10.00
## Mode :character Median :616.0 Median :4963 Median :10.00
## Mean :617.4 Mean :4973 Mean :11.07
## 3rd Qu.:631.0 3rd Qu.:4983 3rd Qu.:13.00
## Max. :695.0 Max. :4983 Max. :13.00
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 09:00:00.00
## 1st Qu.: 0.00 1st Qu.:2013-03-30 10:00:00.00
## Median :25.00 Median :2013-06-26 10:00:00.00
## Mean :16.88 Mean :2013-06-28 21:25:14.98
## 3rd Qu.:30.00 3rd Qu.:2013-09-25 10:00:00.00
## Max. :44.00 Max. :2013-12-31 09:00:00.00
# Same scatterplot restricted to the longest flights, coloured by carrier.
ggplot(flights_large, aes(x = distance, y = air_time, color = carrier)) +
  geom_point(size = 1.2)
# Split the longest flights (distance above 4500) into the UA and HA subsets.
flights_ua <- flights_nona %>%
  filter(carrier == "UA", distance > 4500)
flights_ha <- flights_nona %>%
  filter(carrier == "HA", distance > 4500)
# Column-by-column summary of the UA long-distance flights.
flights_ua %>%
  summary()
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 921 Min. : 930
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:1324 1st Qu.:1325
## Median :2013 Median : 7.000 Median :16.00 Median :1331 Median :1329
## Mean :2013 Mean : 6.535 Mean :15.74 Mean :1289 Mean :1267
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1341 3rd Qu.:1335
## Max. :2013 Max. :12.000 Max. :31.00 Max. :1755 Max. :1344
## dep_delay arr_time sched_arr_time arr_delay
## Min. :-13.00 Min. : 34 Min. :1527 Min. :-70.000
## 1st Qu.: -2.00 1st Qu.:1748 1st Qu.:1801 1st Qu.:-15.000
## Median : 2.00 Median :1807 Median :1813 Median : -3.000
## Mean : 13.52 Mean :1785 Mean :1790 Mean : 3.922
## 3rd Qu.: 11.50 3rd Qu.:1846 3rd Qu.:1836 3rd Qu.: 7.500
## Max. :290.00 Max. :2213 Max. :1944 Max. :299.000
## carrier flight tailnum origin
## Length:359 Min. :15 Length:359 Length:359
## Class :character 1st Qu.:15 Class :character Class :character
## Mode :character Median :15 Mode :character Mode :character
## Mean :15
## 3rd Qu.:15
## Max. :15
## dest air_time distance hour
## Length:359 Min. :562.0 Min. :4963 Min. : 9.00
## Class :character 1st Qu.:599.0 1st Qu.:4963 1st Qu.:13.00
## Mode :character Median :611.0 Median :4963 Median :13.00
## Mean :612.1 Mean :4963 Mean :12.36
## 3rd Qu.:622.5 3rd Qu.:4963 3rd Qu.:13.00
## Max. :695.0 Max. :4963 Max. :13.00
## minute time_hour
## Min. :10.00 Min. :2013-01-01 13:00:00.00
## 1st Qu.:29.00 1st Qu.:2013-04-02 01:00:00.00
## Median :30.00 Median :2013-07-01 13:00:00.00
## Mean :30.62 Mean :2013-07-02 19:32:05.34
## 3rd Qu.:35.00 3rd Qu.:2013-10-03 01:00:00.00
## Max. :44.00 Max. :2013-12-31 09:00:00.00
# Mean air time of the HA long-distance flights.
flights_ha %>%
  summarise(mean(air_time))
## # A tibble: 1 × 1
## `mean(air_time)`
## <dbl>
## 1 623.
This graph shows a clear positive correlation between distance and air time. The principal investigator and associated team would like to look further into these results. Our preliminary results show that each route has a set distance and a set air time associated with that distance. With the increasing amount of air traffic and the recent boom in in-flight disturbances, our team believes that air time could be significantly affected, which could change how airlines handle on-board fuel. We also noticed that the recorded distance is the theoretical distance, not the actual distance traveled. We are seeking additional follow-on funding to conduct a deep-dive analysis of fuel consumption across carriers and to review the actual distance traveled, in order to find the risks associated with delays that ultimately force aircraft to stay in the sky.
Start early so that if you do have trouble, you can email me with questions