I, Tuan Linh Vu, hereby state that I have not gained information in any way not allowed by the exam rules during this exam, and that all work is my own.
mpg data setAfter loading tidyverse library, a data set named
mpg should be ready to explore. The following questions are
based on this data set.
mpg_overall which is the
average of city and highway fuel consumption in miles per gallon. Then
create a histogram of this new variable with each group covering values
of 20-22, 22-24 etc.mpg <- mpg %>%
mutate(mpg_overall = (cty + hwy) / 2)
ggplot(mpg, aes(x = mpg_overall)) +
geom_histogram(binwidth = 2, fill = "blue", color = "black") +
labs(
title = "Histogram of Overall MPG",
x = "Average MPG of City and Highway",
y = "Count"
) + theme(plot.title = element_text(hjust = 0.5))
mpg_overall.ggplot(mpg, aes(x = drv, y = mpg_overall, fill = drv)) +
geom_boxplot() +
labs(
title = "Relationship Between Drive Train Type and Overall MPG",
x = "Drive Train Type",
y = "Average MPG (City + Highway)"
) + theme(plot.title = element_text(hjust = 0.5))
mpg_overall.mpg %>%
group_by(class) %>%
summarise(mean_mpg_overall = mean(mpg_overall)) %>%
arrange(desc(mean_mpg_overall))
year and cyl to mpg_overall. You
shall treat year and cyl as categorical
variables in your graph.mpg <- mpg %>%
mutate(
year = factor(year),
cyl = factor(cyl))
ggplot(mpg, aes(x = cyl, y = mpg_overall, fill = cyl)) +
geom_boxplot() +
facet_wrap(~ year) +
labs(
title = "Composite Effect of Year and Cylinders on Overall MPG",
x = "Number of Cylinders",
y = "Average MPG of City and Highway"
) + theme(plot.title = element_text(hjust = 0.5))
flights data setFor the following tasks, use data set flights of the
nycflights13 package.
jfk_delay <- flights %>%
filter(origin == "JFK", month == 11, year == 2013) %>%
group_by(day) %>%
summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(avg_arr_delay))
jfk_delay
cancel_flight which is
Cancelled if the departure time or arrival time is
NA, otherwise Not Cancelled.flights <- flights %>%
mutate(cancel_flight = if_else(is.na(dep_time) | is.na(arr_time),
"Cancelled",
"Not Cancelled"))
head(flights)
distance between cancelled flights and non-cancelled
flights.ggplot(flights, aes(x = distance, fill = cancel_flight)) +
geom_density(alpha = 0.5) +
labs(
title = "Distribution of Flight Distance for Cancelled vs Non-Cancelled Flights",
x = "Distance",
y = "Density",
fill = "Flight Status"
)
##### Most flights occur at short to medium distances (below about 1000
miles) for both cancelled and non-cancelled flights. The distributions
overlap heavily, indicating that flight distance alone does not strongly
differentiate cancelled flights from non-cancelled flights.
routes <- flights %>%
distinct(origin, dest)
routes
nrow(routes)
## [1] 224
distance as a column to the table you created in
d).Hint: You should go back to the original flights data
set and reconstruct the table with distance included. Create a histogram
of distance for the route table.
routes <- flights %>%
distinct(origin, dest, distance)
routes
ggplot(routes, aes(x = distance)) +
geom_histogram(binwidth = 200, fill = "steelblue", color = "black") +
labs(
title = "Distribution of Flight Route Distances",
x = "Distance (miles)",
y = "Count"
) + theme(plot.title = element_text(hjust = 0.5))
flights <- flights %>%
mutate(cancel_flight = if_else(is.na(dep_time) | is.na(arr_time),
"Cancelled",
"Not Cancelled"))
route_cancel_rate <- flights %>%
group_by(origin, dest) %>%
summarise(
total_flights = n(),
cancelled_flights = sum(cancel_flight == "Cancelled"),
cancel_rate = cancelled_flights / total_flights
) %>%
arrange(desc(cancel_rate))
## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.
route_cancel_rate
flights data setThe following questions are also from flights data set.
Each question is worth 5% bonus points if answered correctly.
cancel_by_airline <- flights %>%
group_by(carrier) %>%
summarise(
total_flights = n(),
cancelled_flights = sum(cancel_flight == "Cancelled"),
cancel_rate = cancelled_flights / total_flights
)
ggplot(cancel_by_airline, aes(x = carrier, y = cancel_rate)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(
title = "Cancellation Rate by Airline",
x = "Airline",
y = "Cancellation Rate"
) + theme(plot.title = element_text(hjust = 0.5))
route_competition <- flights %>%
group_by(origin, dest) %>%
summarise(num_carriers = n_distinct(carrier))
## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.
route_competition %>%
filter(num_carriers == max(num_carriers))