# Code from RPubs by Aaron Kingston (Homework Assignment: Analyzing NYC Flight Data).
# Download data and packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
# Open the help page for the flights dataset.
help("flights")
## starting httpd help server ... done
# Look at the data interactively, then print a compact summary of its columns.
view(flights)
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Create a column dep_datetime by combining year, month, day, and dep_time into a POSIXct datetime using lubridate.
# (Hint: use make_datetime() to combine year, month, and day; derive the hour and minute from dep_time with integer division and modulo, e.g., hour = dep_time %/% 100, min = dep_time %% 100.)
# Show the first 5 rows of flights with dep_datetime.
# Output: first 5 rows showing year, month, day, dep_time, and dep_datetime.
# Split the HHMM-encoded dep_time into hour and minute parts, then assemble a
# full departure timestamp from the calendar columns plus those parts.
# dep_hour and dep_min are kept as columns alongside dep_datetime.
flights <- mutate(
  flights,
  dep_hour = dep_time %/% 100,
  dep_min = dep_time %% 100,
  dep_datetime = make_datetime(year, month, day, dep_hour, dep_min)
)
# In R, the %/% operator performs integer division. When applied to a time stored as a number in HHMM form (e.g., 1534 for 15:34), dividing by 100 with %/% extracts the hour component: 1534 %/% 100 gives 15. To extract the minutes component, use the modulo operator %%: 1534 %% 100 gives 34.
# Preview the first five flights, keeping only the date parts, the raw
# departure time, and the derived timestamp.
head(
  select(flights, year, month, day, dep_time, dep_datetime),
  n = 5
)
## # A tibble: 5 × 5
## year month day dep_time dep_datetime
## <int> <int> <int> <int> <dttm>
## 1 2013 1 1 517 2013-01-01 05:17:00
## 2 2013 1 1 533 2013-01-01 05:33:00
## 3 2013 1 1 542 2013-01-01 05:42:00
## 4 2013 1 1 544 2013-01-01 05:44:00
## 5 2013 1 1 554 2013-01-01 05:54:00
#Using dep_datetime from Question 1, create a column weekday with the day of the week (e.g., “Mon”) using wday(dep_datetime, label = TRUE). Use table() to show how many flights occur on each weekday. #Output: The table of flight counts by weekday.
# Label each flight with the abbreviated day of the week of its departure.
flights <- mutate(flights, weekday = wday(dep_datetime, label = TRUE))

# Tabulate flights per weekday and display the result.
weekday_counts <- table(flights[["weekday"]])
weekday_counts
##
## Sun Mon Tue Wed Thu Fri Sat
## 45643 49468 49273 48858 48654 48703 37922
# Or, using dplyr's count():
# Count flights per weekday with dplyr. Unlike table(), the result keeps a row
# for flights whose weekday is NA. count() already returns the rows in weekday
# order, so the original trailing arrange() (called with no sort keys, which
# leaves the rows untouched) has been dropped.
flights |>
  count(weekday)
## # A tibble: 8 × 2
## weekday n
## <ord> <int>
## 1 Sun 45643
## 2 Mon 49468
## 3 Tue 49273
## 4 Wed 48858
## 5 Thu 48654
## 6 Fri 48703
## 7 Sat 37922
## 8 <NA> 8255
#Convert the origin column to a factor called origin_factor. Show the factor levels with levels() and create a frequency table with table(). Make a bar plot of flights by airport using barplot().
# Store the origin airport code as a factor in a new column, then list the
# distinct levels that factor contains.
flights <- mutate(flights, origin_factor = as.factor(origin))
levels(flights[["origin_factor"]])
## [1] "EWR" "JFK" "LGA"
# Frequency table of flights per origin airport. Uses `<-` for assignment,
# consistent with every other assignment in this script.
origin_table <- table(flights$origin_factor)
origin_table
##
## EWR JFK LGA
## 120835 111279 104662
# Bar chart of the number of flights per origin airport code.
ggplot(data = flights, mapping = aes(x = origin_factor)) +
  geom_bar() +
  labs(
    title = "Flights by airport",
    x = "Airport",
    y = "Number of flights"
  )
#Recode origin_factor from Question 4 into a new column origin_recoded with full names: “JFK” to “Kennedy”, “LGA” to “LaGuardia”, “EWR” to “Newark” using fct_recode() or base R. Create a bar plot of the recoded factor.
# Replace the three-letter airport codes with readable airport names in a new
# factor column; each argument reads new_label = "old_level".
flights <- mutate(
  flights,
  origin_recoded = fct_recode(
    origin_factor,
    Kennedy = "JFK",
    LaGuardia = "LGA",
    Newark = "EWR"
  )
)
# Same bar chart as for the airport codes, with the x axis showing the
# recoded airport names.
ggplot(data = flights, mapping = aes(x = origin_recoded)) +
  geom_bar() +
  labs(
    title = "Flights by airport",
    x = "Airport",
    y = "Number of flights"
  )