#Load Tidyverse
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Load NYC Flights Dataset
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#view(flights)
#describe(flights)
# Bind the `flights` table to a name in the current workspace
flights <- flights
# Filter: Hartsfield-Jackson Atlanta International Airport (dest == "ATL")
Atlanta <- flights %>%
  filter(dest == "ATL")
Atlanta
## # A tibble: 17,215 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 554 600 -6 812 837
## 2 2013 1 1 600 600 0 837 825
## 3 2013 1 1 606 610 -4 837 845
## 4 2013 1 1 615 615 0 833 842
## 5 2013 1 1 658 700 -2 944 939
## 6 2013 1 1 754 759 -5 1039 1041
## 7 2013 1 1 807 810 -3 1043 1043
## 8 2013 1 1 814 810 4 1047 1030
## 9 2013 1 1 830 835 -5 1052 1105
## 10 2013 1 1 855 859 -4 1143 1145
## # ... with 17,205 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Filter: narrow the Atlanta-bound flights down to March (month == 3)
MarchATL <- Atlanta %>%
  filter(month == 3)
MarchATL
## # A tibble: 1,448 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 3 1 559 600 -1 831 825
## 2 2013 3 1 600 600 0 848 837
## 3 2013 3 1 601 600 1 827 815
## 4 2013 3 1 611 615 -4 838 842
## 5 2013 3 1 617 615 2 859 855
## 6 2013 3 1 626 630 -4 904 905
## 7 2013 3 1 657 659 -2 931 938
## 8 2013 3 1 725 730 -5 949 1004
## 9 2013 3 1 751 759 -8 1028 1039
## 10 2013 3 1 806 810 -4 1023 1042
## # ... with 1,438 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Select: keep only the departure/arrival time and delay columns
# (a negative delay value means the flight was EARLY)
AtlSelect <- MarchATL %>%
  select(dep_time, dep_delay, arr_time, arr_delay)
AtlSelect
## # A tibble: 1,448 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 559 -1 831 6
## 2 600 0 848 11
## 3 601 1 827 12
## 4 611 -4 838 -4
## 5 617 2 859 4
## 6 626 -4 904 -1
## 7 657 -2 931 -7
## 8 725 -5 949 -15
## 9 751 -8 1028 -11
## 10 806 -4 1023 -19
## # ... with 1,438 more rows
# Five-number summary, mean and NA count of the March ATL arrival times
summary(MarchATL[["arr_time"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3 1115 1538 1514 1911 2357 41
# Let's make a table: count of flights per scheduled arrival time
ATLSchTable <- table(MarchATL[["sched_arr_time"]])
ATLSchTable
##
## 815 825 829 830 837 842 843 844 849 855 901 905 915 931 933 938
## 30 21 5 25 1 5 1 25 25 1 4 21 5 20 6 1
## 1004 1010 1011 1030 1031 1032 1039 1042 1043 1105 1106 1108 1126 1129 1135 1142
## 1 5 25 15 5 25 1 1 8 31 5 25 5 25 1 5
## 1143 1145 1159 1220 1234 1239 1246 1247 1248 1254 1302 1331 1337 1359 1402 1410
## 25 1 5 26 30 1 5 5 20 1 7 25 1 5 25 1
## 1420 1430 1438 1440 1509 1510 1516 1521 1530 1534 1535 1537 1541 1629 1630 1631
## 18 31 29 2 3 2 22 25 1 31 21 1 9 30 10 24
## 1632 1633 1635 1638 1644 1651 1654 1710 1737 1738 1742 1803 1809 1815 1822 1823
## 25 1 1 2 9 4 15 31 25 5 1 25 5 1 4 1
## 1824 1830 1833 1838 1840 1849 1855 1908 1917 1924 1925 1933 1935 1943 2003 2004
## 25 1 9 24 2 22 26 5 21 29 1 30 1 1 9 19
## 2019 2035 2042 2045 2050 2125 2130 2131 2132 2133 2134 2142 2145 2157 2235 2243
## 3 25 1 25 1 22 5 25 4 10 25 1 1 25 5 25
## 2245 2327
## 27 17
# Head and tail: first and last few arrival times in the March ATL subset
AtlHead <- head(MarchATL[["arr_time"]])
AtlHead
## [1] 831 848 827 838 859 904
Atltail <- tail(MarchATL[["arr_time"]])
Atltail
## [1] 2121 2149 2237 2243 2319 2338
# Compare actual against scheduled arrival times for the March ATL flights.
# barplot()'s second positional argument is `width`, so passing
# sched_arr_time there only sets bar widths; it is never drawn as a series.
# A scatter plot puts both variables on an axis instead.
# NOTE(review): scatter of scheduled vs. actual is assumed to be the intended
# comparison -- confirm with the author.
plot(
  MarchATL$sched_arr_time,
  MarchATL$arr_time,
  xlab = "Scheduled arrival time (HHMM)",
  ylab = "Actual arrival time (HHMM)",
  main = "Actual vs. scheduled arrival, ATL, March 2013"
)
# Reference line y = x: points on it arrived exactly at the scheduled time
abline(a = 0, b = 1, lty = 2)
# Convert a clock reading in HHMM form (e.g. 949 for 9:49) to minutes since
# midnight. Integer arithmetic throughout, so integer input stays integer and
# NA propagates.
hhmm_to_minutes <- function(hhmm) {
  (hhmm %/% 100L) * 60L + hhmm %% 100L
}

# Add `arrdiff`: actual minus scheduled arrival, in minutes.
# arr_time and sched_arr_time are HHMM clock readings, so subtracting them
# directly is wrong whenever the hour differs (949 - 1004 gives -55, but the
# flight was 15 minutes early). Both are converted to minutes first.
# The `+ 720L) %% 1440L - 720L` step folds the result into [-720, 720) so an
# arrival just after midnight against a schedule just before midnight is a
# small positive difference rather than roughly -1400.
# NOTE(review): this assumes the true difference is within +/- 12 hours.
New <- MarchATL %>% # working specifically with the March Atlanta data set
  mutate(
    arrdiff = (
      hhmm_to_minutes(arr_time) - hhmm_to_minutes(sched_arr_time) + 720L
    ) %% 1440L - 720L
  )
New
## # A tibble: 1,448 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 3 1 559 600 -1 831 825
## 2 2013 3 1 600 600 0 848 837
## 3 2013 3 1 601 600 1 827 815
## 4 2013 3 1 611 615 -4 838 842
## 5 2013 3 1 617 615 2 859 855
## 6 2013 3 1 626 630 -4 904 905
## 7 2013 3 1 657 659 -2 931 938
## 8 2013 3 1 725 730 -5 949 1004
## 9 2013 3 1 751 759 -8 1028 1039
## 10 2013 3 1 806 810 -4 1023 1042
## # ... with 1,438 more rows, and 12 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # arrdiff <int>
# Point plot of `arrdiff` per carrier, with points colored by `origin`
P1 <- ggplot(New, aes(x = carrier, y = arrdiff)) +
  geom_point(aes(color = origin), size = 1, alpha = 0.5) +
  labs(
    x = "Airline Carriers",
    y = "Arrival Differences",
    subtitle = "All planes arriving in Atlanta, GA March 2013"
  )
P1
## Warning: Removed 41 rows containing missing values (geom_point).
###END