library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
To import the CSV file, I used readr. The file is saved as Arrival_Delays_Data.
df <- read_csv("~/Documents/Arrival_Delays_Data.csv")
## New names:
## Rows: 5 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, ...2 dbl (3): Los Angeles, San Diego, San Francisco num (2):
## Phoenix, Seattle
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## • `` -> `...2`
df
## # A tibble: 5 Ă— 7
## ...1 ...2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
glimpse(df)
## Rows: 5
## Columns: 7
## $ ...1 <chr> "ALASKA", NA, NA, "AM WEST", NA
## $ ...2 <chr> "on time", "delayed", NA, "on time", "delayed"
## $ `Los Angeles` <dbl> 497, 62, NA, 694, 117
## $ Phoenix <dbl> 221, 12, NA, 4840, 415
## $ `San Diego` <dbl> 212, 20, NA, 383, 65
## $ `San Francisco` <dbl> 503, 102, NA, 320, 129
## $ Seattle <dbl> 1841, 305, NA, 201, 61
I removed the row of NAs that separated Alaska and Am West Airlines:
df <- df[rowSums(is.na(df)) != ncol(df), ]
df
## # A tibble: 4 Ă— 7
## ...1 ...2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 <NA> delayed 117 415 65 129 61
I renamed the first and second columns to Airline and Arrival_Status, respectively:
colnames(df) [1] <- "Airline"
colnames(df) [2] <- "Arrival_Status"
I filled in the remaining NAs with the Airline representing delayed arrival flights:
df[c(2,4),1] <- df[c(1,3),1]
df
## # A tibble: 4 Ă— 7
## Airline Arrival_Status `Los Angeles` Phoenix `San Diego` San Francis…¹ Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
## # … with abbreviated variable name ¹​`San Francisco`
To change the table wide to long, I used pivot_longer(), renaming the cities as Destination and values to Total Number of Flights:
df_longer <- df %>%
pivot_longer(cols = c(3:7), names_to = "Destination", values_to = "Total_Number_of_Flights")
df_longer
## # A tibble: 20 Ă— 4
## Airline Arrival_Status Destination Total_Number_of_Flights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San Diego 383
## 14 AM WEST on time San Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San Diego 65
## 19 AM WEST delayed San Francisco 129
## 20 AM WEST delayed Seattle 61
I stored the arrival delays into a new variable, df_delays, with the total number of delayed flights in descending order:
df_delays <- df_longer %>%
filter(Arrival_Status == "delayed") %>%
group_by(Destination) %>%
arrange(desc(Total_Number_of_Flights))
df_delays
## # A tibble: 10 Ă— 4
## # Groups: Destination [5]
## Airline Arrival_Status Destination Total_Number_of_Flights
## <chr> <chr> <chr> <dbl>
## 1 AM WEST delayed Phoenix 415
## 2 ALASKA delayed Seattle 305
## 3 AM WEST delayed San Francisco 129
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA delayed San Francisco 102
## 6 AM WEST delayed San Diego 65
## 7 ALASKA delayed Los Angeles 62
## 8 AM WEST delayed Seattle 61
## 9 ALASKA delayed San Diego 20
## 10 ALASKA delayed Phoenix 12
mean_delays <- df_delays %>%
group_by(Airline) %>%
summarise(Average_Delays = round(mean(Total_Number_of_Flights)), Number_of_Flights = n()) %>%
arrange(desc(Average_Delays))
mean_delays
## # A tibble: 2 Ă— 3
## Airline Average_Delays Number_of_Flights
## <chr> <dbl> <int>
## 1 AM WEST 157 5
## 2 ALASKA 100 5
I used pivot_wider in order to separate the arrival ontime and delay data. I stored the results in a new variable, dfdelays_ontime:
dfdelays_ontime <- df_longer %>%
pivot_wider(names_from = Arrival_Status, values_from = Total_Number_of_Flights)
dfdelays_ontime
## # A tibble: 10 Ă— 4
## Airline Destination `on time` delayed
## <chr> <chr> <dbl> <dbl>
## 1 ALASKA Los Angeles 497 62
## 2 ALASKA Phoenix 221 12
## 3 ALASKA San Diego 212 20
## 4 ALASKA San Francisco 503 102
## 5 ALASKA Seattle 1841 305
## 6 AM WEST Los Angeles 694 117
## 7 AM WEST Phoenix 4840 415
## 8 AM WEST San Diego 383 65
## 9 AM WEST San Francisco 320 129
## 10 AM WEST Seattle 201 61
colnames(dfdelays_ontime) [3] = "on_time"
I transformed the arrival on-time and delay data into percentages for each city, using kable to format the newly created table:
flight_percentages <- dfdelays_ontime %>%
mutate(total = delayed + on_time,
On_Time_Percentage = round(on_time/total*100),
Delayed_Percentage = round(delayed/total*100)) %>%
arrange(desc(Delayed_Percentage))
knitr::kable(flight_percentages, "pipe", align=c("l", "c", "c"))
| Airline | Destination | on_time | delayed | total | On_Time_Percentage | Delayed_Percentage |
|---|---|---|---|---|---|---|
| AM WEST | San Francisco | 320 | 129 | 449 | 71 | 29 |
| AM WEST | Seattle | 201 | 61 | 262 | 77 | 23 |
| ALASKA | San Francisco | 503 | 102 | 605 | 83 | 17 |
| AM WEST | San Diego | 383 | 65 | 448 | 85 | 15 |
| ALASKA | Seattle | 1841 | 305 | 2146 | 86 | 14 |
| AM WEST | Los Angeles | 694 | 117 | 811 | 86 | 14 |
| ALASKA | Los Angeles | 497 | 62 | 559 | 89 | 11 |
| ALASKA | San Diego | 212 | 20 | 232 | 91 | 9 |
| AM WEST | Phoenix | 4840 | 415 | 5255 | 92 | 8 |
| ALASKA | Phoenix | 221 | 12 | 233 | 95 | 5 |
Looking at ontime and delay percentages separately:
Arrival_Ontime <- flight_percentages %>%
select(Airline, Destination, On_Time_Percentage) %>%
arrange(desc(On_Time_Percentage))
knitr::kable(Arrival_Ontime, "pipe", align=c("l", "c", "c"))
| Airline | Destination | On_Time_Percentage |
|---|---|---|
| ALASKA | Phoenix | 95 |
| AM WEST | Phoenix | 92 |
| ALASKA | San Diego | 91 |
| ALASKA | Los Angeles | 89 |
| ALASKA | Seattle | 86 |
| AM WEST | Los Angeles | 86 |
| AM WEST | San Diego | 85 |
| ALASKA | San Francisco | 83 |
| AM WEST | Seattle | 77 |
| AM WEST | San Francisco | 71 |
ggplot(Arrival_Ontime, aes(x=Destination, y=On_Time_Percentage, fill=Airline)) +
geom_bar(stat='identity', position='dodge', width=0.7) +
theme_minimal()
Arrival_Delayed <- flight_percentages %>%
select(Airline, Destination, Delayed_Percentage) %>%
arrange(desc(Delayed_Percentage))
knitr::kable(Arrival_Delayed, "pipe", align=c("l", "c", "c"))
| Airline | Destination | Delayed_Percentage |
|---|---|---|
| AM WEST | San Francisco | 29 |
| AM WEST | Seattle | 23 |
| ALASKA | San Francisco | 17 |
| AM WEST | San Diego | 15 |
| ALASKA | Seattle | 14 |
| AM WEST | Los Angeles | 14 |
| ALASKA | Los Angeles | 11 |
| ALASKA | San Diego | 9 |
| AM WEST | Phoenix | 8 |
| ALASKA | Phoenix | 5 |
ggplot(Arrival_Delayed, aes(x=Destination, y=Delayed_Percentage, fill=Airline)) +
geom_bar(stat='identity', position='dodge') +
theme_minimal()
I looked at the arrival ontime and delay data summary of each Airline, creating a boxplot that compares each Airline:
Arrival_Delayed %>%
filter(Airline == "ALASKA") %>%
select(Delayed_Percentage) %>%
summary()
## Delayed_Percentage
## Min. : 5.0
## 1st Qu.: 9.0
## Median :11.0
## Mean :11.2
## 3rd Qu.:14.0
## Max. :17.0
Arrival_Delayed %>%
filter(Airline == "AM WEST") %>%
select(Delayed_Percentage) %>%
summary()
## Delayed_Percentage
## Min. : 8.0
## 1st Qu.:14.0
## Median :15.0
## Mean :17.8
## 3rd Qu.:23.0
## Max. :29.0
ggplot(Arrival_Delayed, aes(Airline, Delayed_Percentage)) +
geom_boxplot()
Arrival_Ontime %>%
filter(Airline == "ALASKA") %>%
select(On_Time_Percentage) %>%
summary()
## On_Time_Percentage
## Min. :83.0
## 1st Qu.:86.0
## Median :89.0
## Mean :88.8
## 3rd Qu.:91.0
## Max. :95.0
Arrival_Ontime %>%
filter(Airline == "AM WEST") %>%
select(On_Time_Percentage) %>%
summary()
## On_Time_Percentage
## Min. :71.0
## 1st Qu.:77.0
## Median :85.0
## Mean :82.2
## 3rd Qu.:86.0
## Max. :92.0
ggplot(Arrival_Ontime, aes(Airline, On_Time_Percentage)) +
geom_boxplot()
When comparing the arrival on-time and delay performances between Alaska and Am West Airlines, Alaska performed better than Am West overall. Alaska has fewer delays than Am West, with an average of 11% compared to Am West which has an average of 17%. When comparing the on-time and delay performance of each city, each Airline’s highest average of delays and lowest arrival on-time performance occurred in San Francisco, which could indicate that its airport terminal may have a high frequency of plane traffic coming in and out of the city. Conversely, the highest average of ontime performance and lowest average of delays each airline occurred in Phoenix, which could indicate that there is less plane traffic in that city.