1. The mpg data set

a.Create a new variable mpg_overall which is the average of city and highway fuel consumption in miles per gallon. Then create a histogram of this new variable with each group covering values of 20-22, 22-24 etc.

a1 <- mpg %>%
  mutate(mpg_overall = (cty+hwy)/2)

  ggplot(a1) +
    geom_histogram(aes(mpg_overall), binwidth = 2, boundary = 20) +
    labs(title = "MPG Overall Histogram",
           x = "Mean of City and Highway Fuel Consumption",
           y = "Frequency") +
    theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))

b.Create a graph to study the relationship between drive train types and mpg_overall

ggplot(a1) +
  geom_boxplot(aes(x = drv, y = mpg_overall)) +
  labs(title = "Drive Train Types And Mean Of MPG",
           x = "Drive Train Types",
           y = "Mean of MPG") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))

c.Create a table to find out which car class has the highest mean mpg_overall.

c1 <- a1 %>%
  group_by(class) %>%
  summarise(mean_MPGA = mean(mpg_overall)) %>%
  arrange(desc(mean_MPGA))

c1
## # A tibble: 7 × 2
##   class      mean_MPGA
##   <chr>          <dbl>
## 1 subcompact      24.3
## 2 compact         24.2
## 3 midsize         23.0
## 4 2seater         20.1
## 5 minivan         19.1
## 6 suv             15.8
## 7 pickup          14.9

d.Create a proper graph to study the composite effect of year and cyl to mpg_overall. You shall treat year and cyl as categorical variables in your graph.

ggplot(a1) +
  geom_histogram(aes(mpg_overall), binwidth = 2, boundary = 20) +
  facet_wrap(~as.factor(year)~as.factor(cyl)) +
  labs(title = "Composite effect Of Year And Cyl To MPG_Overall",
           x = "MPG_Overall",
           y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))

2. The flights data set

a.For JFK airport, which day in November 2013 has the biggest average arrival delay? Create a table to answer the question.

a2 <- flights %>%
  filter(year == 2013, month == 11, !is.na(arr_delay)) %>%
  group_by(day) %>%
  summarise(Avg_Adelay = mean(arr_delay)) %>%
  arrange(desc(Avg_Adelay))

a2
## # A tibble: 30 × 2
##      day Avg_Adelay
##    <int>      <dbl>
##  1    17      20.6 
##  2    27      17.5 
##  3     7      13.2 
##  4     1      11.3 
##  5    12      10.1 
##  6    22       9.18
##  7    26       6.23
##  8    24       5.84
##  9    23       3.81
## 10     4       3.46
## # ℹ 20 more rows

Answer: Day 17 on November 2013 has the biggest average arrival delay.

b.Create a new variable cancel_flight which is Cancelled if the departure time or arrival time is NA, otherwise Not Cancelled.

b2 <- flights %>%
  mutate(cancel_flight = case_when(
    is.na(dep_time) & is.na(arr_time) ~ "Cancelled",
    !is.na(dep_time) | !is.na(arr_time) ~ "Not Cancelled"
  ))

b2
## # A tibble: 336,776 × 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, cancel_flight <chr>

c.Create a density graph that compares the distribution of distance between cancelled flights and non-cancelled flights.

ggplot(b2) +
  geom_density(aes(distance, colour = cancel_flight)) +
  labs(title = "Density Graph of distance between cancelled flights",
           x = "Distance",
           y = "Density") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.2)))

d.How many unique flight routes are there in the data set? That is, each unique combination of an origin airport and a destination airport (such as from EWR to ORD) is considered as a route. Create a table to answer the question.

d2 <- flights %>%
  group_by(origin, dest) %>%
  summarise(frequency = length(origin))

d2
## # A tibble: 224 × 3
## # Groups:   origin [3]
##    origin dest  frequency
##    <chr>  <chr>     <int>
##  1 EWR    ALB         439
##  2 EWR    ANC           8
##  3 EWR    ATL        5022
##  4 EWR    AUS         968
##  5 EWR    AVL         265
##  6 EWR    BDL         443
##  7 EWR    BNA        2336
##  8 EWR    BOS        5327
##  9 EWR    BQN         297
## 10 EWR    BTV         931
## # ℹ 214 more rows
nrow(d2)
## [1] 224

Answer: 224 unique routes.

e.Add distance as a column to the table you created in d). Hint: You should go back to the original flights data set and reconstruct the table with distance included. Create a histogram of distance for the route table.

e2 <- flights %>%
  group_by(origin, dest) %>%
  summarise(frequency = length(origin), distance = mean(distance))

ggplot(e2) +
  geom_histogram(aes(distance), colour = "lightblue") +
  labs(title = "Histogram Graph of flight distance",
           x = "Distance",
           y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.2)))

f.Which route has the highest rate of flight cancellation? Create a table to answer the question.

f2 <- b2 %>%
  group_by(origin, dest) %>%
  summarise(cancellation_ratio = sum(cancel_flight == "Cancelled", na.rm = TRUE) / length(cancel_flight)) %>%
  arrange(desc(cancellation_ratio))

f2
## # A tibble: 224 × 3
## # Groups:   origin [3]
##    origin dest  cancellation_ratio
##    <chr>  <chr>              <dbl>
##  1 EWR    LGA               1     
##  2 LGA    MHT               0.239 
##  3 LGA    DSM               0.146 
##  4 LGA    TYS               0.140 
##  5 EWR    JAC               0.130 
##  6 LGA    DAY               0.120 
##  7 LGA    CHO               0.115 
##  8 LGA    GRR               0.109 
##  9 EWR    CAE               0.0865
## 10 LGA    BHM               0.0845
## # ℹ 214 more rows

Answer: The highest is the flight route from EWR to LGA.