library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# For interactive inspection of the raw table, run: view(flights)
# Per-column descriptive statistics for the flights table; the namespace is
# spelled out so it is clear which package's describe() produces the summary.
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
library(nycflights13)
library(ggplot2)
# Keep only flights whose date, schedule and arrival-time fields are present.
# Comma-separated conditions inside filter() are combined with AND.
flights_nona <- flights %>%
  filter(
    !is.na(year), !is.na(month), !is.na(day),
    !is.na(sched_dep_time), !is.na(arr_time), !is.na(sched_arr_time),
    !is.na(hour), !is.na(minute), !is.na(time_hour)
  )

# Summarise per carrier:
#   count     - number of flights kept for the carrier
#   dist      - mean distance flown by the carrier
#   delay     - mean arrival delay of the carrier
#   delaycost - delay cost index, defined as count * delay / dist
#
# arr_delay is not covered by the filter above and is still missing for some
# of the remaining rows, so the mean has to drop NAs explicitly; without
# na.rm = TRUE both delay and delaycost come out as NA for every carrier.
delays <- flights_nona %>%
  group_by(carrier) %>%
  summarize(
    count = n(),
    dist = mean(distance),
    delay = mean(arr_delay, na.rm = TRUE),
    delaycost = count * delay / dist
  ) %>%
  arrange(desc(count)) # sort the rows by count, largest first

head(delays) # look at the data
## # A tibble: 6 x 5
## carrier count dist delay delaycost
## <chr> <int> <dbl> <dbl> <dbl>
## 1 UA 57916 1532. NA NA
## 2 B6 54137 1070. NA NA
## 3 EV 51251 563. NA NA
## 4 DL 47746 1238. NA NA
## 5 AA 32059 1343. NA NA
## 6 MQ 25076 571. NA NA
# Bar chart of the per-carrier `count` column for every carrier in `delays`,
# one bar per carrier, with bars coloured by carrier.
delays_carrier <- ggplot(data = delays) +
  geom_col(
    mapping = aes(x = carrier, y = count, fill = carrier),
    position = "dodge"
  ) +
  labs(
    title = "Airlines with the most Delays in 2013",
    y = "Number of Delays",
    fill = "Carrier"
  )

delays_carrier # render the plot
# Bar chart restricted to ten hand-picked carriers (the first six are the
# carriers at the top of the sorted `delays` table).
top10 <- delays %>%
  filter(
    carrier %in% c("UA", "B6", "EV", "DL", "AA", "MQ", "9E", "US", "WN", "VX")
  ) %>%
  ggplot() +
  geom_col(
    aes(x = carrier, y = count, fill = carrier),
    position = "dodge"
  ) +
  # labs() names labels by aesthetic, so the y-axis title must be passed as
  # `y`; an argument called `ylab` is not matched to any axis and is ignored.
  labs(
    y = "Number of Delays",
    title = "Top 10 Airlines with Most Frequent Delays in 2013",
    fill = "Carrier"
  )

top10 # render the plot
This bar graph shows the number of delays for the top 10 airlines in 2013. We can see that UA is the carrier with the most frequent delays, with B6, EV, and DL not far behind. VX appears to be the least delayed carrier among these ten, though the carriers outside this top-10 group could still be delayed more than VX. The airlines with the most delays appear to have been delayed roughly 45,000 to 58,000 times; since these are UA, B6, EV, and DL, these airlines should be avoided.