Using nycflights13, a built-in dataset in R, to answer 5 questions including:
Also, there is a description about the process from data transformation to data visualization.
library(nycflights13)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(RColorBrewer)
# load dataset
data("flights")
# head dataset
head(flights)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# explore dataset
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# check missing values
ifelse(mean(complete.cases(flights)) == 1, "No NA Founded", "Found NA")
## [1] "Found NA"
This dataset contains missing values, next we will remove the missing values.
# remove missing values
flights <- na.omit(flights)
Before creating charts, we have to convert the data type from character data to factor data because we mainly create bar charts, factor are more efficient to store and process. For example, a barplot of factor data will automatically group the bars by the different levels of the factor. This makes it easy to compare the different levels of the factor.
# convert character to factor
flights <- flights %>%
mutate_if(is.character, as.factor)
First, we want to find the delayed fights in June to December, so we
have to filter necessary data. In this case, the
destination = MIA and month = June to
December.
# filter MIA destination and delayed flights between June to December
MIA_flights <- flights %>%
filter(dest == "MIA", month >= 6 & month <= 12, dep_delay > 0)
MIA_flights
## # A tibble: 2,409 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 1 758 735 23 1052 1035
## 2 2013 10 1 822 800 22 1100 1117
## 3 2013 10 1 1111 1110 1 1406 1430
## 4 2013 10 1 1252 1245 7 1538 1600
## 5 2013 10 1 1316 1257 19 1553 1602
## 6 2013 10 1 1509 1505 4 1808 1830
## 7 2013 10 1 1742 1735 7 2018 2055
## 8 2013 10 1 1824 1815 9 2118 2135
## 9 2013 10 1 1945 1935 10 2244 2250
## 10 2013 10 1 2030 2028 2 2306 2339
## # ℹ 2,399 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <fct>, flight <int>,
## # tailnum <fct>, origin <fct>, dest <fct>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
As month column is integer, it is necessary to convert
data type to factor and order the factor before reorder the chart.
# Convert month (int) to factor
MIA_flights$month <-
factor(
MIA_flights$month,
levels = 6:12,
labels = c(
"June",
"July",
"August",
"September",
"October",
"November",
"December"
),
ordered = TRUE
)
# Count delay by month and create chart
MIA_flights_chart <- MIA_flights %>%
count(month) %>%
ggplot(aes(
x = reorder(month, -n),
y = n,
fill = month
)) +
geom_col() +
theme_minimal() +
scale_fill_brewer(palette = "Paired") +
geom_label(aes(label = n), vjust = -0.2, label.size = NA, fill = "white") +
labs(title = "MIA Flight Delays from June to December 2013",
x = "Month",
y = "Number of Flight Delays",
fill = "Flight Delay Months")
MIA_flights_chart
The plot above is displayed as a bar graph. The bar graph shows that December has the highest amount of flight delays in 2013 with 526 times, while September has the least amount of flight delays with 229 times.
pop_flights_dec <- flights %>%
filter(month == 12) %>%
count(dest) %>%
arrange(-n) %>%
head(10) %>%
ggplot(., aes(x = reorder(dest, -n), y = n, fill = dest)) +
geom_bar(stat = "identity") +
theme_minimal() +
geom_label(aes(label = n), vjust = -0.2, label.size = NA, fill = "white") +
labs(title = "The Most Popular Destination in December",
x = "Destination",
y = "Count")
pop_flights_dec
The plot above shows that top 10 popular destination in December are Hartsfield-Jackson Atlanta International Airport (ATL), Los Angeles International Airport (LAX), Orlando International Airport (MCO), with 1429, 1390, 1203 flights respectively.
# filter origin flights from JFK
JFK_flights <- flights %>%
filter(origin == "JFK") %>%
select(month, day, origin, dest, tailnum)
JFK_flights
## # A tibble: 109,079 × 5
## month day origin dest tailnum
## <int> <int> <fct> <fct> <fct>
## 1 1 1 JFK MIA N619AA
## 2 1 1 JFK BQN N804JB
## 3 1 1 JFK MCO N593JB
## 4 1 1 JFK PBI N793JB
## 5 1 1 JFK TPA N657JB
## 6 1 1 JFK LAX N29129
## 7 1 1 JFK BOS N708JB
## 8 1 1 JFK ATL N3739P
## 9 1 1 JFK SFO N532UA
## 10 1 1 JFK RSW N635JB
## # ℹ 109,069 more rows
# find an average monthly
JFK_flights_avg_monthly <- JFK_flights %>%
group_by(month) %>%
summarise(avg_flights = mean(n()))
JFK_flights_avg_monthly
## # A tibble: 12 × 2
## month avg_flights
## <int> <dbl>
## 1 1 9031
## 2 2 8007
## 3 3 9497
## 4 4 9013
## 5 5 9270
## 6 6 9182
## 7 7 9757
## 8 8 9870
## 9 9 8788
## 10 10 9096
## 11 11 8645
## 12 12 8923
# convert month to factor
JFK_flights_avg_monthly$month <-
factor(
JFK_flights_avg_monthly$month,
levels = 1:12,
labels = c(
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December"
),
ordered = TRUE
)
# create a chart
JFK_flights_avg_monthly_chart <-
ggplot(JFK_flights_avg_monthly, aes(x = month, y = avg_flights, fill = month)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
coord_flip() +
scale_fill_brewer(palette = "Paired") +
geom_label(aes(label = avg_flights), hjust = 0, label.size = NA, fill = "white") +
labs(title = "Average Monthly Flights Departing from JFK in 2013",
x = "Month",
y = "Number of Flights")
JFK_flights_avg_monthly_chart
This bar graph above shows that the most average flights departing from John F. Kennedy International Airport (JFK) was in August 2013 with 9870 flights, followed by July with 9757 flights and March 9497 flights.
airline_delays <- flights %>%
filter(dep_delay > 0) %>%
group_by(carrier) %>%
summarise(sum_delays = round(sum(dep_delay)/60), 0) %>%
arrange(desc(sum_delays)) %>%
left_join(airlines, by = "carrier") %>%
head(10)
airline_delays
## # A tibble: 10 × 4
## carrier sum_delays `0` name
## <chr> <dbl> <dbl> <chr>
## 1 EV 19221 0 ExpressJet Airlines Inc.
## 2 B6 14145 0 JetBlue Airways
## 3 UA 13468 0 United Air Lines Inc.
## 4 DL 9449 0 Delta Air Lines Inc.
## 5 AA 6259 0 American Airlines Inc.
## 6 MQ 5940 0 Envoy Air
## 7 9E 5639 0 Endeavor Air Inc.
## 8 WN 3788 0 Southwest Airlines Co.
## 9 US 2613 0 US Airways Inc.
## 10 VX 1264 0 Virgin America
# create a chart
airline_delays_plot <- ggplot(airline_delays, aes(x = reorder(carrier, -sum_delays), y = sum_delays, fill = carrier)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
scale_fill_brewer(palette = "Set3") +
geom_label(aes(label = sum_delays), vjust = -0.2, label.size = NA, fill = "white") +
labs(title = "Top 10 Airlines with the Most Delayed Departures",
x = "Airlines",
y = "Time of Delays (Hour)")
airline_delays_plot
After analyzing top 10 airlines with the most delayed departures, we found that ExpressJet Airlines (EV) has the most amount of flight delays with 19221 hours, followed by JetBlue Airways (B6) with 14145 hours and United Air Lines (UA) 13468 hours.
top_plane_manufacturers <- planes %>%
group_by(manufacturer) %>%
count(manufacturer) %>%
arrange(-n) %>%
head(5)
top_plane_manufacturers
## # A tibble: 5 × 2
## # Groups: manufacturer [5]
## manufacturer n
## <chr> <int>
## 1 BOEING 1630
## 2 AIRBUS INDUSTRIE 400
## 3 BOMBARDIER INC 368
## 4 AIRBUS 336
## 5 EMBRAER 299
# create a chart
top_plane_manufacturers_plot <- ggplot(top_plane_manufacturers, aes(x = reorder(manufacturer, n), y = n,
fill = manufacturer)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_minimal() +
geom_label(aes(label = n), hjust = -0.2, label.size = NA, fill = "white") +
labs(title = "Top 5 Airplane Manufacturers to NYC",
x = "Name of Manufacturer",
y = "Count")
top_plane_manufacturers_plot
Top 5 airplane manufacturers flying to New York in 2013 is BOEING, AIRBUS INDUSTRIE, BOMBERBADIER INC, AIRBUS and EMBRAER.