I, Linh Le, hereby state that I have not gained information in any way not allowed by the exam rules during this exam, and that all work is my own.
# load required packages here
library(airports)
library(cherryblossom)
library(usdata)
library(tidyverse)
library(openintro)
library(nycflights13)
mpg data setAfter loading tidyverse library, a data set named
mpg should be ready to explore. The following questions are
based on this data set.
mpg_overall which is the
average of city and highway fuel consumption in miles per gallon. Then
create a histogram of this new variable with each group covering values
of 20-22, 22-24 etc.# Enter code here.
new_mpg <- mpg %>%
mutate(mpg_overall = (cty+hwy)/2)
new_mpg
ggplot(new_mpg) +
geom_histogram(aes(x=mpg_overall), binwidth = 2, boundary = 10)
mpg_overall.# Enter code here.
ggplot(data = new_mpg, mapping = aes(x = drv, y = mpg_overall)) +
stat_boxplot(geom = "errorbar", width = 0.5) +
geom_boxplot()
Answer: Based on the graph, front wheel drive is the one has the highest mile per gallon while 4 wheel drive and rear wheel drive has the same range. For 4 wheel drive, in average, it can no go far with just 1 gallon which make sense because it use more energy to control 4 wheel differently. for ther front wheel drive it has a lot of outlier data, some of them can even run 40 mile per gallon, but overall it can run alot more which 1 gallon compare to the others.
mpg_overall.# Enter code here.
highst<- new_mpg %>%
group_by(class)%>%
summarise(mean= mean(mpg_overall))%>%
arrange(desc(mean))
print(highst)
## # A tibble: 7 × 2
## class mean
## <chr> <dbl>
## 1 subcompact 24.3
## 2 compact 24.2
## 3 midsize 23.0
## 4 2seater 20.1
## 5 minivan 19.1
## 6 suv 15.8
## 7 pickup 14.9
Answer: Subcompact class has the highest mean of overall miles per gallon with 24.3 mpg
year and cyl to mpg_overall. You
shall treat year and cyl as categorical
variables in your graph.# Enter code here.
ggplot(data = new_mpg) +
geom_point(mapping = aes(x = cyl, y = mpg_overall)) +
facet_wrap(~ factor(year)) +
labs(title = "Vehicle Fuel Economy Data by Year and Number of Cylinders",
x = "Number of Cylinders",
y = "Overall Mile per Gallon") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5), margin = margin(15,15,15,15)),
axis.title = element_text(size = rel(1.2)),
axis.title.x = element_text(margin = margin(10,5,5,5)),
axis.title.y = element_text(margin = margin(5,10,5,5)),
axis.text = element_text(size = rel(1.2)))
# Enter code here.
ggplot(data = new_mpg, aes(x= factor(cyl), y = mpg_overall)) +
stat_boxplot(geom = "errorbar", width = 0.5) +
geom_boxplot() +
facet_wrap(~ factor(year)) +
labs(title = "Vehicle Fuel Economy Data by Year and Number of Cylinders",
x = "Number of Cylinders",
y = "Overall Mile per Gallon") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5), margin = margin(15,15,15,15)),
axis.title = element_text(size = rel(1.2)),
axis.title.x = element_text(margin = margin(10,5,5,5)),
axis.title.y = element_text(margin = margin(5,10,5,5)),
axis.text = element_text(size = rel(1.2)))
# Enter code here.
ggplot(data = new_mpg) +
stat_summary(mapping = aes(x= factor(cyl), y = mpg_overall, fill = factor(year)), fun = "mean", geom = "bar", position = "dodge")+
labs(title = "Vehicle Fuel Economy Data by Year and Number of Cylinders",
x = "Number of Cylinders",
y = "Mean of Overall Mile per Gallon") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5), margin = margin(15,15,15,15)),
axis.title = element_text(size = rel(1.2)),
axis.title.x = element_text(margin = margin(10,5,5,5)),
axis.title.y = element_text(margin = margin(5,10,5,5)),
axis.text = element_text(size = rel(1.2)))
Answer:
Based on the three graph above, We can tell that the Mile per Gallon for car market that was manufacture in 2008 is more stable with less outlier and in general can run further compare to the car that was manufacture in 1999. A special thing is that there are cars in 2008 that use 5 cylinders and it’s stably can run for almost 25 miles per gallon. In 1999 they didn’t have this 5 cylinders option.
flights data setFor the following tasks, use data set flights of the
nycflights13 package.
# Enter code here.
nov23<- flights %>%
filter(origin == "JFK", month == 11, year == 2013, !is.na(arr_delay))%>%
dplyr :: select(year,month,day, arr_delay)%>%
arrange(desc(arr_delay))
print(nov23)
## # A tibble: 8,645 × 4
## year month day arr_delay
## <int> <int> <int> <dbl>
## 1 2013 11 24 614
## 2 2013 11 27 396
## 3 2013 11 11 344
## 4 2013 11 27 334
## 5 2013 11 26 324
## 6 2013 11 24 304
## 7 2013 11 2 299
## 8 2013 11 24 290
## 9 2013 11 27 285
## 10 2013 11 27 281
## # ℹ 8,635 more rows
Answer:
For JFK as the origin, The 24th of November 2013 has the biggest average arrival delay with 614 mins delay
cancel_flight which is
Cancelled if the departure time or arrival time is
NA, otherwise Not Cancelled.# Enter code here.
cancel_data <- flights%>%
mutate(cancel_flight = ifelse(is.na(dep_time)| is.na(arr_time), "Cancelled", "Not Cancelled"))
filter(cancel_data, cancel_flight == "Cancelled")
print(cancel_data)
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, cancel_flight <chr>
Answer:
cancel_data <- flights%>% mutate(cancel_flight =
ifelse(is.na(dep_time)| is.na(arr_time), “Cancelled”, “Not Cancelled”))
distance between cancelled flights and non-cancelled
flights.# Enter code here.
ggplot(cancel_data)+
geom_density(aes(x = distance, fill = cancel_flight))+
labs(title = "Distance vs Cancelation", x = "Distance (mile)", y = "") +
scale_y_continuous(labels = scales::comma)
ggplot(cancel_data) +
stat_summary(mapping = aes(x = cancel_flight, y = distance, fill = cancel_flight), geom = "bar", fun = "mean") +
labs(title = "Mean Distance vs Cancelation", x = "", y = "Mean Distance (mile)")
# Enter code here.
flight_route <- flights%>%
group_by(origin, dest)%>%
summarise(count= n()) %>%
dplyr :: select(origin, dest)
flight_route
print(nrow(flight_route))
## [1] 224
Answer:
There are 224 unique flight routes
distance as a column to the table you created in
d).Hint: You should go back to the original flights data
set and reconstruct the table with distance included. Create a histogram
of distance for the route table.
# Enter code here.
flight_route_d <- flights%>%
group_by(origin, dest, distance)%>%
summarise(count= n(), )%>%
dplyr :: select(origin, dest, distance)
flight_route_d
ggplot(flight_route_d)+
geom_histogram(aes(distance))
# This is just for fun
ggplot(flight_route_d)+
geom_histogram(aes(distance, fill = dest, color = origin))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
# Enter code here.
flight_route_c <- cancel_data%>%
group_by(origin, dest, cancel_flight)%>%
summarise(count = n())
flight_route_c_t <- flight_route_c %>%
pivot_wider(names_from = cancel_flight, values_from = count)
flight_route_c_f <- flight_route_c_t %>%
mutate ( Cancel_rate = Cancelled/(`Cancelled` + `Not Cancelled`))%>%
arrange(desc(Cancel_rate))
flight_route_c_f
Answer:
Route from LGA to MHT has the highest rate of flight cancellation with 23.94%
flights data setThe following questions are also from flights data set.
Each question is worth 5% bonus points if answered correctly.
# Enter code here.
flight_route_b <- cancel_data%>%
group_by(carrier, cancel_flight)%>%
summarise(count = n())%>%
pivot_wider(names_from = cancel_flight, values_from = count) %>%
mutate ( Cancel_rate = Cancelled/(`Cancelled` + `Not Cancelled`))%>%
arrange(Cancel_rate)
flight_route_b
ggplot(flight_route_b)+
stat_summary(aes(x = carrier, y =Cancel_rate, fill = carrier), geom = "bar", fun = "mean")+
labs(title = "Mean cancel rate by Airline",
x = "Airline",
y = "Mean Cancel Rate")
Answer: AS airline has the smallest cancel rate
# Enter code here.
Answer: