#Load Tidyverse

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#Load NYC Flights Dataset

library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
#view(flights)
#describe(flights)
# Bind the `flights` dataset (from nycflights13) to a workspace variable.
flights <- flights

#Filter: Hartsfield Jackson Atlanta International Airport

# Keep only the rows whose destination code is "ATL".
Atlanta <- flights %>%
  filter(dest == "ATL")

Atlanta
## # A tibble: 17,215 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      554            600        -6      812            837
##  2  2013     1     1      600            600         0      837            825
##  3  2013     1     1      606            610        -4      837            845
##  4  2013     1     1      615            615         0      833            842
##  5  2013     1     1      658            700        -2      944            939
##  6  2013     1     1      754            759        -5     1039           1041
##  7  2013     1     1      807            810        -3     1043           1043
##  8  2013     1     1      814            810         4     1047           1030
##  9  2013     1     1      830            835        -5     1052           1105
## 10  2013     1     1      855            859        -4     1143           1145
## # ... with 17,205 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

#Filter: March Flights

# Narrow the Atlanta-bound flights down to those flown in month 3 (March).
MarchATL <- Atlanta %>%
  filter(month == 3)

MarchATL
## # A tibble: 1,448 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     3     1      559            600        -1      831            825
##  2  2013     3     1      600            600         0      848            837
##  3  2013     3     1      601            600         1      827            815
##  4  2013     3     1      611            615        -4      838            842
##  5  2013     3     1      617            615         2      859            855
##  6  2013     3     1      626            630        -4      904            905
##  7  2013     3     1      657            659        -2      931            938
##  8  2013     3     1      725            730        -5      949           1004
##  9  2013     3     1      751            759        -8     1028           1039
## 10  2013     3     1      806            810        -4     1023           1042
## # ... with 1,438 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

#Filter: Putting it all together

# Keep just the departure/arrival times and their delay columns.
# Negative delay = EARLY arrival.
AtlSelect <- MarchATL %>%
  select(dep_time, dep_delay, arr_time, arr_delay)

AtlSelect
## # A tibble: 1,448 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      559        -1      831         6
##  2      600         0      848        11
##  3      601         1      827        12
##  4      611        -4      838        -4
##  5      617         2      859         4
##  6      626        -4      904        -1
##  7      657        -2      931        -7
##  8      725        -5      949       -15
##  9      751        -8     1028       -11
## 10      806        -4     1023       -19
## # ... with 1,438 more rows
# Quartiles, mean and NA count of the actual arrival times in March.
summary(MarchATL[["arr_time"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       3    1115    1538    1514    1911    2357      41

#Let's Make a Table

# Count how many March flights share each scheduled arrival time.
ATLSchTable <- table(MarchATL[["sched_arr_time"]])

ATLSchTable
## 
##  815  825  829  830  837  842  843  844  849  855  901  905  915  931  933  938 
##   30   21    5   25    1    5    1   25   25    1    4   21    5   20    6    1 
## 1004 1010 1011 1030 1031 1032 1039 1042 1043 1105 1106 1108 1126 1129 1135 1142 
##    1    5   25   15    5   25    1    1    8   31    5   25    5   25    1    5 
## 1143 1145 1159 1220 1234 1239 1246 1247 1248 1254 1302 1331 1337 1359 1402 1410 
##   25    1    5   26   30    1    5    5   20    1    7   25    1    5   25    1 
## 1420 1430 1438 1440 1509 1510 1516 1521 1530 1534 1535 1537 1541 1629 1630 1631 
##   18   31   29    2    3    2   22   25    1   31   21    1    9   30   10   24 
## 1632 1633 1635 1638 1644 1651 1654 1710 1737 1738 1742 1803 1809 1815 1822 1823 
##   25    1    1    2    9    4   15   31   25    5    1   25    5    1    4    1 
## 1824 1830 1833 1838 1840 1849 1855 1908 1917 1924 1925 1933 1935 1943 2003 2004 
##   25    1    9   24    2   22   26    5   21   29    1   30    1    1    9   19 
## 2019 2035 2042 2045 2050 2125 2130 2131 2132 2133 2134 2142 2145 2157 2235 2243 
##    3   25    1   25    1   22    5   25    4   10   25    1    1   25    5   25 
## 2245 2327 
##   27   17

#Head and Tails

# First few actual arrival times of the March Atlanta flights.
AtlHead <- head(MarchATL[["arr_time"]])

AtlHead
## [1] 831 848 827 838 859 904
# Last few actual arrival times of the March Atlanta flights.
Atltail <- tail(MarchATL[["arr_time"]])

Atltail
## [1] 2121 2149 2237 2243 2319 2338
# Compare actual against scheduled arrival times for the March Atlanta flights.
# barplot(heights, widths) would treat the scheduled times as bar *widths*
# (its second positional argument is `width`) and draw one bar per flight,
# so a scatter plot is used instead: scheduled time on x, actual time on y.
# Rows with a missing arr_time are simply not drawn.
plot(
  x = MarchATL$sched_arr_time,
  y = MarchATL$arr_time,
  xlab = "Scheduled arrival time (HHMM)",
  ylab = "Actual arrival time (HHMM)",
  main = "Actual vs. scheduled arrival, Atlanta, March 2013"
)
# Points on this line arrived exactly at their scheduled time.
abline(a = 0, b = 1)

#Show the difference between actual and scheduled arrival times for March flights to Atlanta

# Convert a clock time stored as an HHMM number (e.g. 1004 means 10:04) into
# minutes after midnight. Integer input gives integer output; NA stays NA.
hhmm_to_minutes <- function(hhmm) {
  (hhmm %/% 100L) * 60L + hhmm %% 100L
}

# Add `arrdiff`: actual minus scheduled arrival, in minutes, for the March
# Atlanta flights. Both columns are HHMM clock times, so subtracting them
# directly is wrong whenever the hour differs (949 - 1004 gives -55 for a
# flight that was 15 minutes early); both are converted to minutes first.
New <- MarchATL %>%
  mutate(
    arrdiff = hhmm_to_minutes(arr_time) - hhmm_to_minutes(sched_arr_time),
    # Fold into [-720, 720) so an arrival just after midnight against a
    # schedule just before it counts as late, not roughly 24 hours early.
    # Assumes no flight is more than 12 hours off schedule.
    arrdiff = (arrdiff + 720L) %% 1440L - 720L
  )
New
## # A tibble: 1,448 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     3     1      559            600        -1      831            825
##  2  2013     3     1      600            600         0      848            837
##  3  2013     3     1      601            600         1      827            815
##  4  2013     3     1      611            615        -4      838            842
##  5  2013     3     1      617            615         2      859            855
##  6  2013     3     1      626            630        -4      904            905
##  7  2013     3     1      657            659        -2      931            938
##  8  2013     3     1      725            730        -5      949           1004
##  9  2013     3     1      751            759        -8     1028           1039
## 10  2013     3     1      806            810        -4     1023           1042
## # ... with 1,438 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   arrdiff <int>
# Scatter of arrival difference by carrier, one point per flight, coloured by
# the origin airport. Rows with a missing `arrdiff` are dropped by ggplot2
# with a warning.
P1 <- ggplot(New, aes(x = carrier, y = arrdiff)) +
  geom_point(aes(color = origin), alpha = 0.5, size = 1) +
  labs(
    x = "Airline Carriers",
    y = "Arrival Differences",
    subtitle = "All planes arriving in Atlanta, GA March 2013"
  )
P1
## Warning: Removed 41 rows containing missing values (geom_point).

#Carrier goes on the X-axis

#Arrival difference goes on the Y-axis

#Why? Arrival difference is the outcome being compared across the carriers whose flights arrived in Atlanta in March 2013

###END