Import Data

# Read the Excel workbook and coerce both measure columns to numeric
# in a single mutate() call.
airlines <- mutate(
    read_excel("../00_data/MyData.xlsx"),
    n_events = as.numeric(n_events),
    avail_seat_km_per_week = as.numeric(avail_seat_km_per_week)
)
# Print the imported tibble.
airlines
## # A tibble: 336 × 6
##      Ref airline               avail_seat_km_per_week year_range type_…¹ n_eve…²
##    <dbl> <chr>                                  <dbl> <chr>      <chr>     <dbl>
##  1    NA Aer Lingus                         320906734 85_99      incide…       2
##  2     2 Aeroflot*                         1197672318 85_99      incide…      76
##  3     3 Aerolineas Argentinas              385803648 85_99      incide…       6
##  4     4 Aeromexico*                        596871813 85_99      incide…       3
##  5     5 Air Canada                        1865253802 85_99      incide…       2
##  6     6 Air France                        3004002661 85_99      incide…      14
##  7     7 Air India*                         869253552 85_99      incide…       2
##  8     8 Air New Zealand*                   710174817 85_99      incide…       3
##  9     9 Alaska Airlines*                   965346773 85_99      incide…       5
## 10    10 Alitalia                           698012498 85_99      incide…       7
## # … with 326 more rows, and abbreviated variable names ¹​type_of_event,
## #   ²​n_events
# Print the tibble a second time; nothing has modified it since the first print.
airlines
## # A tibble: 336 × 6
##      Ref airline               avail_seat_km_per_week year_range type_…¹ n_eve…²
##    <dbl> <chr>                                  <dbl> <chr>      <chr>     <dbl>
##  1    NA Aer Lingus                         320906734 85_99      incide…       2
##  2     2 Aeroflot*                         1197672318 85_99      incide…      76
##  3     3 Aerolineas Argentinas              385803648 85_99      incide…       6
##  4     4 Aeromexico*                        596871813 85_99      incide…       3
##  5     5 Air Canada                        1865253802 85_99      incide…       2
##  6     6 Air France                        3004002661 85_99      incide…      14
##  7     7 Air India*                         869253552 85_99      incide…       2
##  8     8 Air New Zealand*                   710174817 85_99      incide…       3
##  9     9 Alaska Airlines*                   965346773 85_99      incide…       5
## 10    10 Alitalia                           698012498 85_99      incide…       7
## # … with 326 more rows, and abbreviated variable names ¹​type_of_event,
## #   ²​n_events

Introduction

Questions

Variation

Visualizing distributions

# Bar chart of n_events: one bar per distinct value, height = row count.
ggplot(airlines, aes(n_events)) +
    geom_bar()

# Histogram of n_events using bins that are 0.5 wide.
ggplot(airlines, aes(n_events)) +
    geom_histogram(binwidth = 0.5)

# Same 0.5-wide histogram, restricted to rows with fewer than 25 events.
ggplot(filter(airlines, n_events < 25), aes(n_events)) +
    geom_histogram(binwidth = 0.5)

Typical values

# Histogram of avail_seat_km_per_week with ggplot2's default binning.
ggplot(airlines, aes(avail_seat_km_per_week)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values

# Full-range histogram of avail_seat_km_per_week (default binning).
ggplot(airlines, aes(x = avail_seat_km_per_week)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with the visible y range limited to 0-50 via
# coord_cartesian(), which changes the view only, not the binned data.
ggplot(airlines, aes(x = avail_seat_km_per_week)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values

# Keep rows whose avail_seat_km_per_week lies in the inclusive range
# [320906734, 596871813].
filter(
    airlines,
    avail_seat_km_per_week >= 320906734,
    avail_seat_km_per_week <= 596871813
)
## # A tibble: 96 × 6
##      Ref airline               avail_seat_km_per_week year_range type_…¹ n_eve…²
##    <dbl> <chr>                                  <dbl> <chr>      <chr>     <dbl>
##  1    NA Aer Lingus                         320906734 85_99      incide…       2
##  2     3 Aerolineas Argentinas              385803648 85_99      incide…       6
##  3     4 Aeromexico*                        596871813 85_99      incide…       3
##  4    13 Austrian Airlines                  358239823 85_99      incide…       1
##  5    14 Avianca                            396922563 85_99      incide…       5
##  6    18 Condor                             417982610 85_99      incide…       2
##  7    19 COPA                               550491507 85_99      incide…       3
##  8    21 Egyptair                           557699891 85_99      incide…       8
##  9    22 El Al                              335448023 85_99      incide…       1
## 10    23 Ethiopian Airlines                 488560643 85_99      incide…      25
## # … with 86 more rows, and abbreviated variable names ¹​type_of_event, ²​n_events
# Scatter plot of avail_seat_km_per_week for each year_range value.
ggplot(airlines, aes(year_range, avail_seat_km_per_week)) +
    geom_point()

Covariation

A categorical and continuous variable

# Boxplot of n_events, one box per year_range value.
ggplot(airlines, aes(year_range, n_events)) +
    geom_boxplot()

Two categorical variables

# Count plot of type_of_event against year_range; each point is sized by the
# number of rows sharing that combination.
# The data reaches ggplot() through the pipe. ggplot() has no `airlines`
# argument, so the former `airlines = airlines` was silently ignored and
# has been dropped.
airlines %>%
    ggplot() +
    geom_count(mapping = aes(x = type_of_event, y = year_range)) +
    theme(axis.text.x = element_text(angle = 90))

Two continuous variables

# 2D bin plot of n_events against airline.
# The data reaches ggplot() through the pipe. ggplot() has no `airlines`
# argument, so the former `airlines = airlines` was silently ignored and
# has been dropped.
# NOTE(review): `airline` is a character column, so this pairs a numeric
# variable with a categorical one -- confirm that is the intended comparison.
airlines %>%
    ggplot() +
    geom_bin2d(mapping = aes(x = n_events, y = airline)) +
    theme(axis.text.x = element_text(angle = 90))

Patterns and models

# Horizontal boxplots of n_events, one box per airline.
# The data reaches ggplot() through the pipe. ggplot() has no `airlines`
# argument, so the former `airlines = airlines` was silently ignored and
# has been dropped.
airlines %>%
    ggplot() +
    geom_boxplot(mapping = aes(x = n_events, y = airline)) +
    theme(axis.text.x = element_text(angle = 90))