wide_df <- read_csv('https://raw.githubusercontent.com/jacshap/Data607/refs/heads/main/Assignment_5A_Flight_Data.csv')
## New names:
## Rows: 4 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, ...2 dbl (5): Los Angeles, Phoenix, San Diego, San Francisco,
## Seattle
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## • `` -> `...2`
print(wide_df)
## # A tibble: 4 × 7
## ...1 ...2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 <NA> delayed 117 415 65 129 61
wide_df <- wide_df %>% rename(Airline = ...1, Status = ...2)
wide_df[2,1] = "ALASKA"
wide_df[4,1] = "AMWEST"
df <- wide_df %>% pivot_longer(cols = 'Los Angeles':'Seattle',
names_to = 'City',
values_to = "Amount"
)
Make table of average delay per airline.
avg_delays <- df %>% group_by(Airline) %>% filter(Status == "delayed") %>% summarize(avg_delay = mean(Amount))
print(avg_delays)
## # A tibble: 2 × 2
## Airline avg_delay
## <chr> <dbl>
## 1 ALASKA 100.
## 2 AMWEST 157.
ggplot(data = avg_delays, aes(x=Airline, y=avg_delay, fill = Airline)) +
geom_bar(stat = 'identity') +
labs(title = "Average Delay per Airline", x = "Airline", y = "Average Delay") +
theme_light()
AMWEST has, on average, over 50% more delays than ALASKA.
Make table per airline of percentage of delay rates.
perc_delays <- df %>% group_by(Airline) %>% summarize(percent_delayed = sum(Amount[Status=="delayed"])*100/sum(Amount[Status=="on time"]))
print(perc_delays)
## # A tibble: 2 × 2
## Airline percent_delayed
## <chr> <dbl>
## 1 ALASKA 15.3
## 2 AMWEST 12.2
ggplot(data = perc_delays, aes(x=Airline, y=percent_delayed, fill = Airline)) +
geom_bar(stat = 'identity') +
labs(title = "Delay Percentage per Airline", x = "Airline", y = "Percent Delay") +
theme_light()
Despite ALASKA having less overall mean delays, they have the higher percentage of delayed flights.
Same as above but also group by city.
perc_city_delays <- df %>% group_by(Airline, City) %>% summarize(percent_delayed = sum(Amount[Status=="delayed"])*100/sum(Amount[Status=="on time"])) %>% arrange(desc(percent_delayed))
## `summarise()` has grouped output by 'Airline'. You can override using the
## `.groups` argument.
print(perc_city_delays)
## # A tibble: 10 × 3
## # Groups: Airline [2]
## Airline City percent_delayed
## <chr> <chr> <dbl>
## 1 AMWEST San Francisco 40.3
## 2 AMWEST Seattle 30.3
## 3 ALASKA San Francisco 20.3
## 4 AMWEST San Diego 17.0
## 5 AMWEST Los Angeles 16.9
## 6 ALASKA Seattle 16.6
## 7 ALASKA Los Angeles 12.5
## 8 ALASKA San Diego 9.43
## 9 AMWEST Phoenix 8.57
## 10 ALASKA Phoenix 5.43
#reorder puts city in descending order
ggplot(data = perc_city_delays, aes(x=reorder(City, -percent_delayed), y=percent_delayed, fill = Airline)) +
geom_bar(stat = 'identity', position = 'dodge') +
labs(title = "Delay Percentage per City", x = "City", y = "Percent Delay") +
theme_light()
Both airlines have a relative amount of delays depending on the city, with the same trend among the same cities but AMWEST consistently having higher percentage of delays. Based on this, the city or airport may make the biggest difference in amount of delays. San Francisco and Seattle had especially high amounts of delays for AMWEST.
City-by-city we can see that ALASKA has a lower delay, but overall Alaska has the higher percentage of delayed flights.
df %>% group_by(Airline) %>% summarize(sum(Amount))
## # A tibble: 2 × 2
## Airline `sum(Amount)`
## <chr> <dbl>
## 1 ALASKA 3775
## 2 AMWEST 7225
AMWEST had about double the amount of flights compared to ALASKA from the 5 cities inspected. We saw that overall there were more delays for AMWEST, but by overall percentage the performance was better for AMWEST, which is explained by AMWEST having so many more flights. However, on a city-by-city basis we see AMWEST consistently having more delays. If we did a weighted average we’d find that AMWEST would have a higher average percent of delays.
# Trying different things to get weighted average
#AMtot <- df %>% filter(Airline == "AMWEST") %>% summarize(sum(Amount))
#make list of weights - the total amount of flights for each airline
#weighting <- c(3775, 3775, 3775, 3775, 3775, 3775, 3775, 3775, 3775, 3775, 7225, 7225, 7225, 7225, 7225, 7225, 7225, 7225, 7225, 7225)
#weighting <- 3775/7225
#df %>% group_by(Airline) %>% filter(Status == "delayed") %>% summarize(weighted_mean = weighted.mean(Amount, w = weighting))
# Simpler shown via the below
AL_tot <- 3775
AM_tot <- 7225
AL_weight <- AL_tot / sum(AL_tot, AM_tot)
AM_weight <- AM_tot / sum(AL_tot, AM_tot)
perc_delays %>% mutate(weighted_percents = percent_delayed*c(AL_weight,AM_weight))
## # A tibble: 2 × 3
## Airline percent_delayed weighted_percents
## <chr> <dbl> <dbl>
## 1 ALASKA 15.3 5.25
## 2 AMWEST 12.2 8.03