Load the package

# Install and load required packages
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Data manipulation

# Fetch FiveThirtyEight's airline-safety CSV directly from GitHub.
# The address is kept in its own variable so the read step stays short.
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv"
airlines <- url %>%
  read_csv()
## Rows: 56 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): airline
## dbl (7): avail_seat_km_per_week, incidents_85_99, fatal_accidents_85_99, fat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show every column with its type and first few values
airlines %>%
  glimpse()
## Rows: 56
## Columns: 8
## $ airline                <chr> "Aer Lingus", "Aeroflot*", "Aerolineas Argentin…
## $ avail_seat_km_per_week <dbl> 320906734, 1197672318, 385803648, 596871813, 18…
## $ incidents_85_99        <dbl> 2, 76, 6, 3, 2, 14, 2, 3, 5, 7, 3, 21, 1, 5, 4,…
## $ fatal_accidents_85_99  <dbl> 0, 14, 0, 1, 0, 4, 1, 0, 0, 2, 1, 5, 0, 3, 0, 0…
## $ fatalities_85_99       <dbl> 0, 128, 0, 64, 0, 79, 329, 0, 0, 50, 1, 101, 0,…
## $ incidents_00_14        <dbl> 0, 6, 1, 5, 2, 6, 4, 5, 5, 4, 7, 17, 1, 0, 6, 2…
## $ fatal_accidents_00_14  <dbl> 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 0, 3, 0, 0, 0, 0,…
## $ fatalities_00_14       <dbl> 0, 88, 0, 0, 0, 337, 158, 7, 88, 0, 0, 416, 0, …

Summary statistics

# Let's focus on only a few columns for simplicity: the airline name, its
# weekly seat-kilometres, and the three 1985-1999 safety measures.
selected_columns <- c(
  "airline",
  "avail_seat_km_per_week",
  "incidents_85_99",
  "fatal_accidents_85_99",
  "fatalities_85_99"
)

# `all_of()` requires every name in `selected_columns` to exist in `airlines`,
# so a missing or renamed column raises an error here.
airlines_selected <- airlines %>%
  select(all_of(selected_columns)) %>%
  filter(!is.na(incidents_85_99))  # Remove rows with missing incident data

# Summary statistics
# The filter above only guarantees that `incidents_85_99` is non-missing.
# `na.rm = TRUE` keeps a stray NA in either of the other two columns from
# turning that column's average into NA.
summary_stats <- airlines_selected %>%
  summarise(
    average_incidents = mean(incidents_85_99, na.rm = TRUE),
    average_fatal_accidents = mean(fatal_accidents_85_99, na.rm = TRUE),
    average_fatalities = mean(fatalities_85_99, na.rm = TRUE)
  )

print(summary_stats)
## # A tibble: 1 × 3
##   average_incidents average_fatal_accidents average_fatalities
##               <dbl>                   <dbl>              <dbl>
## 1              7.18                    2.18               112.

Data Visualization

# Let's create a bar chart to visualize the number of incidents for each
# airline. `reorder()` sorts the airlines on the x axis by ascending
# `incidents_85_99` instead of alphabetically.
incident_chart <- airlines_selected %>%
  ggplot(aes(x = reorder(airline, incidents_85_99), y = incidents_85_99)) +
  # geom_col() takes bar heights straight from `y`; it is the dedicated
  # form of geom_bar(stat = "identity").
  geom_col(fill = "skyblue") +
  labs(
    title = "Number of Incidents by Airline",
    x = "Airline",
    y = "Number of Incidents"
  ) +
  theme_minimal() +
  # Tilt the x-axis labels and right-align them against their tick marks.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(incident_chart)

Conclusion

This vignette covers loading data, cleaning and wrangling, calculating summary statistics, and creating a simple bar chart using tidyverse packages (readr, dplyr, and ggplot2) with the airline-safety dataset from fivethirtyeight.com.