a1 <- mpg %>%
mutate(mpg_overall = (cty+hwy)/2)
ggplot(a1) +
geom_histogram(aes(mpg_overall), binwidth = 2, boundary = 20) +
labs(title = "MPG Overall Histogram",
x = "Mean of City and Highway Fuel Consumption",
y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
ggplot(a1) +
geom_boxplot(aes(x = drv, y = mpg_overall)) +
labs(title = "Drive Train Types And Mean Of MPG",
x = "Drive Train Types",
y = "Mean of MPG") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
c1 <- a1 %>%
group_by(class) %>%
summarise(mean_MPGA = mean(mpg_overall)) %>%
arrange(desc(mean_MPGA))
c1
## # A tibble: 7 × 2
## class mean_MPGA
## <chr> <dbl>
## 1 subcompact 24.3
## 2 compact 24.2
## 3 midsize 23.0
## 4 2seater 20.1
## 5 minivan 19.1
## 6 suv 15.8
## 7 pickup 14.9
ggplot(a1) +
geom_histogram(aes(mpg_overall), binwidth = 2, boundary = 20) +
facet_wrap(~as.factor(year)~as.factor(cyl)) +
labs(title = "Composite effect Of Year And Cyl To MPG_Overall",
x = "MPG_Overall",
y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
a2 <- flights %>%
filter(year == 2013, month == 11, !is.na(arr_delay)) %>%
group_by(day) %>%
summarise(Avg_Adelay = mean(arr_delay)) %>%
arrange(desc(Avg_Adelay))
a2
## # A tibble: 30 × 2
## day Avg_Adelay
## <int> <dbl>
## 1 17 20.6
## 2 27 17.5
## 3 7 13.2
## 4 1 11.3
## 5 12 10.1
## 6 22 9.18
## 7 26 6.23
## 8 24 5.84
## 9 23 3.81
## 10 4 3.46
## # ℹ 20 more rows
Answer: Day 17 on November 2013 has the biggest average arrival delay.
b2 <- flights %>%
mutate(cancel_flight = case_when(
is.na(dep_time) & is.na(arr_time) ~ "Cancelled",
!is.na(dep_time) | !is.na(arr_time) ~ "Not Cancelled"
))
b2
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, cancel_flight <chr>
ggplot(b2) +
geom_density(aes(distance, colour = cancel_flight)) +
labs(title = "Density Graph of distance between cancelled flights",
x = "Distance",
y = "Density") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.2)))
d2 <- flights %>%
group_by(origin, dest) %>%
summarise(frequency = length(origin))
d2
## # A tibble: 224 × 3
## # Groups: origin [3]
## origin dest frequency
## <chr> <chr> <int>
## 1 EWR ALB 439
## 2 EWR ANC 8
## 3 EWR ATL 5022
## 4 EWR AUS 968
## 5 EWR AVL 265
## 6 EWR BDL 443
## 7 EWR BNA 2336
## 8 EWR BOS 5327
## 9 EWR BQN 297
## 10 EWR BTV 931
## # ℹ 214 more rows
nrow(d2)
## [1] 224
Answer: 224 unique routes.
e2 <- flights %>%
group_by(origin, dest) %>%
summarise(frequency = length(origin), distance = mean(distance))
ggplot(e2) +
geom_histogram(aes(distance), colour = "lightblue") +
labs(title = "Histogram Graph of flight distance",
x = "Distance",
y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.2)))
f2 <- b2 %>%
group_by(origin, dest) %>%
summarise(cancellation_ratio = sum(cancel_flight == "Cancelled", na.rm = TRUE) / length(cancel_flight)) %>%
arrange(desc(cancellation_ratio))
f2
## # A tibble: 224 × 3
## # Groups: origin [3]
## origin dest cancellation_ratio
## <chr> <chr> <dbl>
## 1 EWR LGA 1
## 2 LGA MHT 0.239
## 3 LGA DSM 0.146
## 4 LGA TYS 0.140
## 5 EWR JAC 0.130
## 6 LGA DAY 0.120
## 7 LGA CHO 0.115
## 8 LGA GRR 0.109
## 9 EWR CAE 0.0865
## 10 LGA BHM 0.0845
## # ℹ 214 more rows
Answer: The highest is the flight route from EWR to LGA.