# Install and load required packages
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the FiveThirtyEight airline-safety CSV directly from GitHub
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv"
airlines <- read_csv(file = url)
## Rows: 56 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): airline
## dbl (7): avail_seat_km_per_week, incidents_85_99, fatal_accidents_85_99, fat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show every column of the raw data with its type and first few values
airlines %>%
  glimpse()
## Rows: 56
## Columns: 8
## $ airline <chr> "Aer Lingus", "Aeroflot*", "Aerolineas Argentin…
## $ avail_seat_km_per_week <dbl> 320906734, 1197672318, 385803648, 596871813, 18…
## $ incidents_85_99 <dbl> 2, 76, 6, 3, 2, 14, 2, 3, 5, 7, 3, 21, 1, 5, 4,…
## $ fatal_accidents_85_99 <dbl> 0, 14, 0, 1, 0, 4, 1, 0, 0, 2, 1, 5, 0, 3, 0, 0…
## $ fatalities_85_99 <dbl> 0, 128, 0, 64, 0, 79, 329, 0, 0, 50, 1, 101, 0,…
## $ incidents_00_14 <dbl> 0, 6, 1, 5, 2, 6, 4, 5, 5, 4, 7, 17, 1, 0, 6, 2…
## $ fatal_accidents_00_14 <dbl> 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 0, 3, 0, 0, 0, 0,…
## $ fatalities_00_14 <dbl> 0, 88, 0, 0, 0, 337, 158, 7, 88, 0, 0, 416, 0, …
Summary statistics
# Narrow the analysis to the airline name, avail_seat_km_per_week, and the
# three *_85_99 columns.
selected_columns <- c(
  "airline",
  "avail_seat_km_per_week",
  "incidents_85_99",
  "fatal_accidents_85_99",
  "fatalities_85_99"
)

# Keep only those columns, then discard rows whose incident count is missing.
airlines_selected <- airlines %>%
  select(all_of(selected_columns)) %>%
  drop_na(incidents_85_99)
# Mean incidents, fatal accidents, and fatalities per airline over the
# *_85_99 columns. Only incidents_85_99 is NA-filtered when airlines_selected
# is built, so each mean passes na.rm = TRUE; without it, a single missing
# value in either of the other two columns would turn that average into NA.
summary_stats <- airlines_selected %>%
  summarise(
    average_incidents = mean(incidents_85_99, na.rm = TRUE),
    average_fatal_accidents = mean(fatal_accidents_85_99, na.rm = TRUE),
    average_fatalities = mean(fatalities_85_99, na.rm = TRUE)
  )

# Print the one-row summary tibble
print(summary_stats)
## # A tibble: 1 × 3
## average_incidents average_fatal_accidents average_fatalities
## <dbl> <dbl> <dbl>
## 1 7.18 2.18 112.
# Bar chart of incidents_85_99 for each airline, with bars ordered from the
# fewest incidents to the most.
incident_chart <- ggplot(
  airlines_selected,
  aes(x = reorder(airline, incidents_85_99), y = incidents_85_99)
) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Number of Incidents by Airline",
    x = "Airline",
    y = "Number of Incidents"
  ) +
  theme_minimal() +
  # Slant the airline names so neighbouring tick labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(incident_chart)
This vignette covers loading data, cleaning and wrangling it, calculating summary statistics, and creating a simple bar chart using tidyverse packages (readr, dplyr, and ggplot2) with the airline-safety dataset from fivethirtyeight.com.