library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# view(flights)  # uncomment to browse the data interactively

# Per-column descriptive statistics for the full flights table.
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
# Keep only the flights whose destination is LAX.
LAX_flights <- filter(flights, dest == "LAX")

# Histogram of departure delays for the LAX-bound flights.
LAX_flights %>%
  ggplot(mapping = aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 98 rows containing non-finite values (stat_bin).
# Narrow further: LAX-bound flights in month 2 only.
LAX_feb_flights <- filter(flights, dest == "LAX", month == 2)

# Histogram of arrival delays for that subset, using 50 bins.
LAX_feb_flights %>%
  ggplot(mapping = aes(x = arr_delay)) +
  geom_histogram(bins = 50)
## Warning: Removed 43 rows containing non-finite values (stat_bin).
# Arrival delay by origin airport, one box per origin.
LAX_feb_flights %>%
  ggplot(mapping = aes(x = origin, y = arr_delay)) +
  geom_boxplot()
## Warning: Removed 43 rows containing non-finite values (stat_boxplot).
# Flights in the month-2 LAX subset whose arrival delay exceeds 60.
arrival_more_than_one_hour_delayed <- LAX_feb_flights %>%
  filter(arr_delay > 60)

# Five-number summary (plus mean and NA count) of arrival delay for the
# whole month-2 LAX subset, not just the delayed flights.
summary(LAX_feb_flights[["arr_delay"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -68.000 -24.000 -11.000 -7.602 2.000 210.000 43
# Arrival delay by origin, restricted to flights delayed by more than 60.
arrival_more_than_one_hour_delayed %>%
  ggplot(mapping = aes(x = origin, y = arr_delay)) +
  geom_boxplot()
### Using the flights dataset, I created (with mutate) a new variable called “avg_speed”. Average speed is distance divided by air time, i.e. the distance traveled divided by the amount of time the plane is in the air (air_time is divided by 60 before the division).
# Add avg_speed = distance / (air_time / 60) as a new column.
# NOTE(review): this rebinds the name `flights` in the current environment,
# so later code sees the augmented table rather than the packaged dataset.
flights <- mutate(flights, avg_speed = distance / (air_time / 60))
# Keep only flights with distance below 3000.
distance_2 <- flights %>%
  filter(distance < 3000)

# Scatter plot of average speed against distance, colored by origin.
# The plot is built directly from `distance_2`. Previously a second, identical
# filter() result was piped into ggplot() while `data = distance_2` was also
# given by name, so the piped table was never used for the plot.
ggplot(
  data = distance_2,
  mapping = aes(x = distance, y = avg_speed, color = origin)
) +
  geom_point()
## Warning: Removed 9424 rows containing missing values (geom_point).
# Drop rows where distance, arr_delay, or dep_delay is missing, so the
# means computed from this table need no na.rm handling.
flights_nona <- filter(
  flights,
  !is.na(distance),
  !is.na(arr_delay),
  !is.na(dep_delay)
)
### Create a data frame composed of summary statistics.
# Build `delays`: one row of summary statistics per origin/month pair.
# `flights_nona` has no missing distance, arr_delay, or dep_delay values,
# so the means below are computed without na.rm.
delays <- flights_nona %>%
  group_by(origin, month) %>% # one group per origin airport and month
  summarize(
    count = n(), # number of flights in the group
    dist = mean(distance), # mean distance flown
    avg_arr_delay = mean(arr_delay), # mean arrival delay
    avg_dep_delay = mean(dep_delay), # mean departure delay
    .groups = "drop" # return an ungrouped tibble (no regrouping message)
  )
# Average arrival delay per month, one colored line (with points) per origin.
delays %>%
  ggplot(mapping = aes(x = month, y = avg_arr_delay, color = origin)) +
  geom_point() +
  geom_line(lwd = 3) +
  labs(
    x = "month",
    y = "Average Arrival Delay",
    title = "Average Arrival Delays for Each Origin"
  )