Import Data
# Read the Excel workbook and coerce the `n_events` and
# `avail_seat_km_per_week` columns to numeric in one mutate() call,
# then print the resulting tibble.
airlines <- read_excel("../00_data/MyData.xlsx") %>%
  mutate(
    n_events = as.numeric(n_events),
    avail_seat_km_per_week = as.numeric(avail_seat_km_per_week)
  )
airlines
## # A tibble: 336 × 6
## Ref airline avail_seat_km_per_week year_range type_…¹ n_eve…²
## <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 NA Aer Lingus 320906734 85_99 incide… 2
## 2 2 Aeroflot* 1197672318 85_99 incide… 76
## 3 3 Aerolineas Argentinas 385803648 85_99 incide… 6
## 4 4 Aeromexico* 596871813 85_99 incide… 3
## 5 5 Air Canada 1865253802 85_99 incide… 2
## 6 6 Air France 3004002661 85_99 incide… 14
## 7 7 Air India* 869253552 85_99 incide… 2
## 8 8 Air New Zealand* 710174817 85_99 incide… 3
## 9 9 Alaska Airlines* 965346773 85_99 incide… 5
## 10 10 Alitalia 698012498 85_99 incide… 7
## # … with 326 more rows, and abbreviated variable names ¹​type_of_event,
## # ²​n_events
# Print the tibble again; nothing has modified `airlines` since the import.
airlines
## # A tibble: 336 × 6
## Ref airline avail_seat_km_per_week year_range type_…¹ n_eve…²
## <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 NA Aer Lingus 320906734 85_99 incide… 2
## 2 2 Aeroflot* 1197672318 85_99 incide… 76
## 3 3 Aerolineas Argentinas 385803648 85_99 incide… 6
## 4 4 Aeromexico* 596871813 85_99 incide… 3
## 5 5 Air Canada 1865253802 85_99 incide… 2
## 6 6 Air France 3004002661 85_99 incide… 14
## 7 7 Air India* 869253552 85_99 incide… 2
## 8 8 Air New Zealand* 710174817 85_99 incide… 3
## 9 9 Alaska Airlines* 965346773 85_99 incide… 5
## 10 10 Alitalia 698012498 85_99 incide… 7
## # … with 326 more rows, and abbreviated variable names ¹​type_of_event,
## # ²​n_events
Introduction
Questions
Variation
Visualizing distributions
# Bar chart counting how many rows share each `n_events` value.
ggplot(data = airlines, mapping = aes(x = n_events)) +
  geom_bar()

# Histogram of `n_events` with bins half a unit wide.
ggplot(airlines, aes(x = n_events)) +
  geom_histogram(binwidth = 0.5)

# Same half-unit histogram, restricted to rows with fewer than 25 events.
ggplot(
  data = filter(airlines, n_events < 25),
  mapping = aes(x = n_events)
) +
  geom_histogram(binwidth = 0.5)

Typical values
# Histogram of `avail_seat_km_per_week` using ggplot2's default binning.
ggplot(data = airlines, mapping = aes(x = avail_seat_km_per_week)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values
# Default-binned histogram of `avail_seat_km_per_week`, used here as the
# starting point for spotting unusual values.
ggplot(airlines, aes(x = avail_seat_km_per_week)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with the visible y-range limited to counts 0-50 so that
# sparsely populated bins become visible.
ggplot(data = airlines, mapping = aes(x = avail_seat_km_per_week)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values
# Keep only rows whose `avail_seat_km_per_week` falls within
# [320906734, 596871813]; both end points are included.
filter(
  airlines,
  between(avail_seat_km_per_week, 320906734, 596871813)
)
## # A tibble: 96 × 6
## Ref airline avail_seat_km_per_week year_range type_…¹ n_eve…²
## <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 NA Aer Lingus 320906734 85_99 incide… 2
## 2 3 Aerolineas Argentinas 385803648 85_99 incide… 6
## 3 4 Aeromexico* 596871813 85_99 incide… 3
## 4 13 Austrian Airlines 358239823 85_99 incide… 1
## 5 14 Avianca 396922563 85_99 incide… 5
## 6 18 Condor 417982610 85_99 incide… 2
## 7 19 COPA 550491507 85_99 incide… 3
## 8 21 Egyptair 557699891 85_99 incide… 8
## 9 22 El Al 335448023 85_99 incide… 1
## 10 23 Ethiopian Airlines 488560643 85_99 incide… 25
## # … with 86 more rows, and abbreviated variable names ¹​type_of_event, ²​n_events
# Scatter plot of `avail_seat_km_per_week` for each `year_range` value.
ggplot(airlines, aes(x = year_range, y = avail_seat_km_per_week)) +
  geom_point()

Covariation
A categorical and continuous variable
# Box plot of `n_events`, one box per `year_range` value.
ggplot(data = airlines, mapping = aes(x = year_range, y = n_events)) +
  geom_boxplot()

Two categorical variables
# Count plot of `type_of_event` against `year_range`; point size encodes
# how many rows fall on each combination.
# The data frame reaches ggplot() through the pipe, so no extra argument
# is needed (the former `airlines = airlines` was not a ggplot() parameter
# and was silently ignored).
airlines %>%
  ggplot() +
  geom_count(mapping = aes(x = type_of_event, y = year_range)) +
  # Rotate the x-axis labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90))

Two continuous variables
# 2-D binned heat map of `n_events` against `airline`.
# The data frame reaches ggplot() through the pipe, so no extra argument
# is needed (the former `airlines = airlines` was not a ggplot() parameter
# and was silently ignored).
# NOTE(review): `airline` is a character column, so only `n_events` is
# continuous here — confirm whether a second numeric column such as
# `avail_seat_km_per_week` was intended for the y-axis.
airlines %>%
  ggplot() +
  geom_bin2d(mapping = aes(x = n_events, y = airline)) +
  theme(axis.text.x = element_text(angle = 90))

Patterns and models
# Horizontal box plots of `n_events`, one box per `airline`.
# The data frame reaches ggplot() through the pipe, so no extra argument
# is needed (the former `airlines = airlines` was not a ggplot() parameter
# and was silently ignored).
airlines %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = n_events, y = airline)) +
  theme(axis.text.x = element_text(angle = 90))
