# Code from RPubs by Aaron Kingston (Homework Assignment: Analyzing NYC Flight Data)
# Download data and packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
# Look up the dataset's help page, then inspect the data with view() and
# print its column structure with str().
help("flights")
## starting httpd help server ... done
view(flights)
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

#Create a column dep_datetime by combining year, month, day, and dep_time into a POSIXct datetime using lubridate. (Hint: Use make_datetime function to combine: year, month, day, for hour and min use division, e.g., hour = dep_time %/% 100, min = dep_time %% 100.) #Show the first 5 rows of flights with dep_datetime. #Output: First 5 rows showing year, month, day, dep_time, and dep_datetime.

# dep_time stores the clock time as a number in HHMM form, so integer division
# by 100 yields the hour and the remainder yields the minute. Both pieces are
# kept as columns and then combined with year/month/day into dep_datetime.
flights <- mutate(
  flights,
  dep_hour = dep_time %/% 100,
  dep_min = dep_time %% 100,
  dep_datetime = make_datetime(
    year = year,
    month = month,
    day = day,
    hour = dep_hour,
    min = dep_min
  )
)

# In R, the %/% operator performs integer division. Applied to a time stored as
# a number in HHMM form (e.g., 1534 for 15:34), dep_time %/% 100 extracts the
# hour component (15). The modulo operator %% extracts the minutes component:
# 1534 %% 100 is 34.

# Show the first five rows of the date parts alongside the new dep_datetime.
head(
  select(flights, year, month, day, dep_time, dep_datetime),
  n = 5
)
## # A tibble: 5 × 5
##    year month   day dep_time dep_datetime       
##   <int> <int> <int>    <int> <dttm>             
## 1  2013     1     1      517 2013-01-01 05:17:00
## 2  2013     1     1      533 2013-01-01 05:33:00
## 3  2013     1     1      542 2013-01-01 05:42:00
## 4  2013     1     1      544 2013-01-01 05:44:00
## 5  2013     1     1      554 2013-01-01 05:54:00

#Using dep_datetime from Question 1, create a column weekday with the day of the week (e.g., “Mon”) using wday(dep_datetime, label = TRUE). Use table() to show how many flights occur on each weekday. #Output: The table of flight counts by weekday.

# Label each flight with the abbreviated weekday of its departure date-time.
flights <- mutate(flights, weekday = wday(dep_datetime, label = TRUE))

# Tabulate flights per weekday. table() omits missing values by default, so
# flights without a dep_datetime are not counted here.
weekday_counts <- table(flights[["weekday"]])
weekday_counts
## 
##   Sun   Mon   Tue   Wed   Thu   Fri   Sat 
## 45643 49468 49273 48858 48654 48703 37922
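# Alternatively, count with dplyr. Unlike table(), count() also reports the
# NA group (flights with no departure date-time):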

# Tally flights per weekday. count() returns one row per group, ordered by the
# factor levels with the NA group last, so no extra arrange() step is needed
# (an argument-less arrange() leaves the rows unchanged).
flights |>
  count(weekday)
## # A tibble: 8 × 2
##   weekday     n
##   <ord>   <int>
## 1 Sun     45643
## 2 Mon     49468
## 3 Tue     49273
## 4 Wed     48858
## 5 Thu     48654
## 6 Fri     48703
## 7 Sat     37922
## 8 <NA>     8255

#Convert the origin column to a factor called origin_factor. Show the factor levels with levels() and create a frequency table with table(). Make a bar plot of flights by airport using barplot().

# Store the origin airport codes as a factor in a new column.
flights <- flights |>
  mutate(
    origin_factor = as.factor(origin)
  )

# List the factor levels.
levels(flights$origin_factor)
## [1] "EWR" "JFK" "LGA"
# Frequency table of flights per origin airport (assigned with `<-`, matching
# the rest of the script).
origin_table <- table(flights$origin_factor)
origin_table
## 
##    EWR    JFK    LGA 
## 120835 111279 104662
# Bar chart with one bar per origin airport code; geom_bar() counts the rows
# in each level to set the bar height.
ggplot(flights, aes(x = origin_factor)) +
  geom_bar() +
  labs(
    x = "Airport",
    y = "Number of flights",
    title = "Flights by airport"
  )

#Recode origin_factor from Question 4 into a new column origin_recoded with full names: “JFK” to “Kennedy”, “LGA” to “LaGuardia”, “EWR” to “Newark” using fct_recode() or base R. Create a bar plot of the recoded factor.

# Swap the three airport codes for full airport names. Each argument reads
# "new name" = "old level".
flights <- mutate(
  flights,
  origin_recoded = fct_recode(
    origin_factor,
    "Newark" = "EWR",
    "Kennedy" = "JFK",
    "LaGuardia" = "LGA"
  )
)


# Same bar chart as for the airport codes, drawn from the recoded factor so
# the axis shows the full airport names.
ggplot(flights, aes(x = origin_recoded)) +
  geom_bar() +
  labs(
    x = "Airport",
    y = "Number of flights",
    title = "Flights by airport"
  )