library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
#view(flights)
# Descriptive statistics for every column of flights (psych::describe).
# Rows whose name ends in `*` in the printed table are non-numeric columns;
# time_hour has no usable numeric values here, which is what triggers the
# min/max warnings and the NaN/Inf entries in its row.
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
## time_hour        19 336776     NaN      NA     NA     NaN      NA  Inf -Inf
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
## time_hour       -Inf    NA       NA   NA
# Keep only the flights whose destination is LAX.
LAX_flights <- filter(flights, dest == "LAX")

I want the departure delays of flights going to LAX, so I filtered the data for flights headed to LAX using `dest == "LAX"`. Below is a histogram of the departure delays of only those flights.

# Histogram of departure delays for LAX-bound flights (default binning).
LAX_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 98 rows containing non-finite values (stat_bin).

# Flights to LAX that departed in February (month 2).
LAX_feb_flights <- filter(flights, dest == "LAX" & month == 2)

I want to look at the arrival delays of the flights going to LAX in February:

# Histogram of arrival delays for February flights to LAX, using 50 bins.
LAX_feb_flights %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram(bins = 50)
## Warning: Removed 43 rows containing non-finite values (stat_bin).

I want to compare the arrival delays by origin airport for flights going to LAX in February (only EWR and JFK have flights to LAX in February).

# One box of arrival delays per origin airport, February flights to LAX.
LAX_feb_flights %>%
  ggplot(aes(x = origin, y = arr_delay)) +
  geom_boxplot()
## Warning: Removed 43 rows containing non-finite values (stat_boxplot).

# Keep only the February LAX flights whose arrival delay exceeds 60.
arrival_more_than_one_hour_delayed <- LAX_feb_flights %>%
  filter(arr_delay > 60)

# Summary of arrival delays across all February LAX flights (not just the
# subset above), including the count of missing values.
LAX_feb_flights %>%
  pull(arr_delay) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -68.000 -24.000 -11.000  -7.602   2.000 210.000      43

I want to compare, by origin airport, the flights going to LAX in February (only EWR and JFK have flights to LAX in February) whose arrival delay was more than 1 hour.

# Boxplot of arrival delays per origin, restricted to delays over one hour.
arrival_more_than_one_hour_delayed %>%
  ggplot(aes(x = origin, y = arr_delay)) +
  geom_boxplot()

### Using the flights dataset, I created (mutated) a new variable called "avg_speed". I figured that average speed is the distance divided by the air time, that is, the distance traveled divided by the amount of time the plane is in the air. The air time is divided by 60 so that the speed is per hour rather than per minute.

# Add avg_speed = distance per hour of air time (air_time / 60 rescales the
# air time before dividing). Note this rebinds `flights` to the augmented table.
flights <- mutate(flights, avg_speed = distance / (air_time / 60))

# Subset of flights with distance below 3000, used for the scatter plot.
distance_2 <- flights %>%
  filter(distance < 3000)

The x variable is distance and the y variable is avg_speed from the dataset "flights", restricted to flights with a distance under 3000 and colored by origin.

# Scatter plot of average speed against distance, colored by origin airport.
# distance_2 already contains only the flights with distance < 3000, so it is
# passed to ggplot() directly. The previous version also piped a second,
# identical filter() result into ggplot(); because `data` was given by name,
# that piped data frame was silently ignored.
ggplot(data = distance_2,
       mapping = aes(x = distance, y = avg_speed, color = origin)) +
  geom_point()
## Warning: Removed 9424 rows containing missing values (geom_point).

# Drop every flight that is missing distance, arrival delay, or departure
# delay, so the means computed later need no NA handling.
flights_nona <- filter(
  flights,
  !is.na(distance),
  !is.na(arr_delay),
  !is.na(dep_delay)
)

Create a data frame of summary statistics for each origin and month:

# Build the delays data frame: one row per (origin, month) combination.
delays <- flights_nona %>%
  group_by(origin, month) %>%            # group by origin airport and month
  summarize(
    count = n(),                         # number of flights in the group
    dist = mean(distance),               # mean distance flown
    avg_arr_delay = mean(arr_delay),     # mean arrival delay
    avg_dep_delay = mean(dep_delay),     # mean departure delay
    .groups = "drop"                     # return an ungrouped result explicitly
  )
# Mean arrival delay per month, one colored line (with points) per origin.
delays %>%
  ggplot(aes(x = month, y = avg_arr_delay, color = origin)) +
  geom_point() +
  geom_line(lwd = 3) +
  labs(
    x = "month",
    y = "Average Arrival Delay",
    title = "Average Arrival Delays for Each Origin"
  )

The visualization that I created shows the mean arrival delay for each origin, for each month of the year. It shows that EWR starts the year with the highest average arrival delay and also ends the year with the highest average arrival delay. The averages for each month are around the same throughout the year. I think we can highlight that there is a huge decline in delays during the summer months.