I, Charles, hereby state that I have not gained information in any way not allowed by the exam rules during this exam, and that all work is my own.
# load required packages here
library(tidyverse)
library(nycflights13)
mpg data set
After loading the tidyverse library, a data set named
mpg should be ready to explore. The following questions are
based on this data set.
Create a new variable mpg_overall which is the
average of city and highway fuel consumption in miles per gallon. Then
create a histogram of this new variable with each group covering values
of 20-22, 22-24, etc.
# Enter code here.
# Add mpg_overall: the mean of the city (cty) and highway (hwy) mileage.
mpg <- mutate(mpg, mpg_overall = (cty + hwy) / 2)
# Histogram of mpg_overall with 2-unit-wide bins aligned to a bin edge at 20
# (so bins cover 20-22, 22-24, ...) and x-axis ticks every 2 units.
mpg %>%
  ggplot(aes(x = mpg_overall)) +
  geom_histogram(
    binwidth = 2,
    boundary = 20,
    color = "black",
    fill = "steelblue"
  ) +
  labs(
    title = "Histogram of Overall Fuel Consumption",
    x = "Overall MPG",
    y = "Count"
  ) +
  scale_x_continuous(breaks = seq(10, 45, by = 2))
mpg_overall.
# Enter code here.
# Boxplots of mpg_overall, one box per drive-train type (drv), with each box
# filled according to its drv value.
mpg %>%
  ggplot(aes(x = drv, y = mpg_overall, fill = drv)) +
  geom_boxplot() +
  labs(
    title = "Overall MPG by Drive Train",
    x = "Drive Train (f = front, r = rear, 4 = 4wd)",
    y = "Overall MPG"
  ) +
  theme_minimal()
Answer:
mpg_overall.
# Enter code here.
# Mean mpg_overall for each vehicle class, listed from highest to lowest.
mpg %>%
  group_by(class) %>%
  summarise(mean_mpg_overall = mean(mpg_overall)) %>%
  arrange(-mean_mpg_overall)
Answer:
year and cyl to mpg_overall. You
shall treat year and cyl as categorical
variables in your graph.
# Boxplots of mpg_overall by year, with boxes split and filled by cylinder
# count; year and cyl are wrapped in factor() so both are treated as
# categorical.
mpg %>%
  ggplot(aes(x = factor(year), y = mpg_overall, fill = factor(cyl))) +
  geom_boxplot() +
  labs(
    title = "Effect of Year and Cylinders on Overall MPG",
    x = "Year",
    y = "Overall MPG",
    fill = "Cylinders"
  ) +
  theme_classic()
Answer:
flights data set
For the following tasks, use the flights data set of the
nycflights13 package.
# Enter code here.
# Average arrival delay per day for flights leaving JFK in November 2013,
# ignoring missing delays; days are listed from the largest average down.
flights %>%
  filter(origin == "JFK" & year == 2013 & month == 11) %>%
  group_by(day) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(-avg_arr_delay)
Answer: Day 27
Create a new variable cancel_flight which is
Cancelled if the departure time or arrival time is
NA, otherwise Not Cancelled.
# Enter code here.
# Label every flight: "Not Cancelled" only when both the departure time and
# the arrival time are present, "Cancelled" when either one is NA.
flights <- flights %>%
  mutate(
    cancel_flight = ifelse(
      !is.na(dep_time) & !is.na(arr_time),
      "Not Cancelled",
      "Cancelled"
    )
  )
Answer:
distance between cancelled flights and non-cancelled
flights.
# Enter code here.
# Overlaid, semi-transparent density curves of flight distance, one per
# cancel_flight status.
flights %>%
  ggplot(aes(x = distance, fill = cancel_flight)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Distance Distribution: Cancelled vs. Not Cancelled Flights",
    x = "Distance",
    y = "Density",
    fill = "Status"
  )
# Enter code here.
# Table of unique origin/destination pairs; its row count is the number of
# distinct routes.
route_table <- flights %>%
  select(origin, dest) %>%
  distinct()
nrow(route_table)
## [1] 224
Answer: 224 unique flight routes
Add distance as a column to the table you created in
d). Hint: You should go back to the original flights data
set and reconstruct the table with distance included. Create a histogram
of distance for the route table.
# Enter code here.
# Route table with distance: exactly one row per origin/destination pair.
# Deduplicating on (origin, dest, distance) would list a route more than once
# whenever its distance is recorded with more than one value, so deduplicate
# on the route alone and keep the first recorded distance for it.
route_table_with_dist <- flights %>%
  distinct(origin, dest, .keep_all = TRUE) %>%
  select(origin, dest, distance)

# Histogram of the route distances using 200-unit-wide bins.
ggplot(route_table_with_dist, aes(x = distance)) +
  geom_histogram(binwidth = 200, color = "black", fill = "lightgreen") +
  labs(
    title = "Histogram of Distances for Unique Flight Routes",
    x = "Distance",
    y = "Count"
  )
# Enter code here.
# For every origin/destination pair: number of flights, number labelled
# "Cancelled", and the resulting cancellation rate, highest rate first.
flights %>%
  group_by(origin, dest) %>%
  summarise(
    total_flights = n(),
    cancel_count = sum(cancel_flight == "Cancelled"),
    .groups = "drop"
  ) %>%
  mutate(cancel_rate = cancel_count / total_flights) %>%
  arrange(-cancel_rate)
Answer: EWR to LGA
flights data set
The following questions are also from the flights data set.
Each question is worth 5% bonus points if answered correctly.
# Enter code here.
# Share of flights labelled "Cancelled" for each carrier.
airline_cancel_rates <- flights %>%
  group_by(carrier) %>%
  summarise(cancel_rate = mean(cancel_flight == "Cancelled"))

# Bar chart of the rates, with carriers ordered by increasing rate.
airline_cancel_rates %>%
  ggplot(aes(x = reorder(carrier, cancel_rate), y = cancel_rate)) +
  geom_col(fill = "coral") +
  labs(
    title = "Cancellation Rates by Airline",
    x = "Airline (Carrier)",
    y = "Cancellation Rate"
  ) +
  theme_minimal()
Answer: HA with 0%
# Enter code here.
# Number of distinct carriers serving each origin/destination pair, with the
# most-served routes first.
competitive_routes <- flights %>%
  group_by(origin, dest) %>%
  summarise(num_carriers = n_distinct(carrier), .groups = "drop") %>%
  arrange(-num_carriers)

# Keep only the route(s) tied for the largest carrier count.
filter(competitive_routes, num_carriers == max(num_carriers))
Answer: