library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readr)
library(gt)
I tried to create the data in the R markdown just to see if I can do it:
delays <- data.frame(airlines = c("ALASKA", "", "", "AM_WEST", ""),
status = c("On_time", "Delayed", "", "On_time", "Delayed"),
Los_Angeles = c(497, 62, NA, 694, 117),
Phoenix = c(221, 12, NA, 4840, 415),
San_Diego =c(212, 20, NA, 383, 65),
San_Francisco = c(503, 102, NA, 320, 129),
Seattle = c(1841, 305, NA, 201, 61))
delays
## airlines status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA On_time 497 221 212 503 1841
## 2 Delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM_WEST On_time 694 4840 383 320 201
## 5 Delayed 117 415 65 129 61
delays1 <- gt(delays)
delays1
airlines | status | Los_Angeles | Phoenix | San_Diego | San_Francisco | Seattle |
---|---|---|---|---|---|---|
ALASKA | On_time | 497 | 221 | 212 | 503 | 1841 |
Delayed | 62 | 12 | 20 | 102 | 305 | |
NA | NA | NA | NA | NA | ||
AM_WEST | On_time | 694 | 4840 | 383 | 320 | 201 |
Delayed | 117 | 415 | 65 | 129 | 61 |
gt_tbl <-
delays |>
gt(rowname_col = "airlines")
gt_tbl
status | Los_Angeles | Phoenix | San_Diego | San_Francisco | Seattle | |
---|---|---|---|---|---|---|
ALASKA | On_time | 497 | 221 | 212 | 503 | 1841 |
Delayed | 62 | 12 | 20 | 102 | 305 | |
NA | NA | NA | NA | NA | ||
AM_WEST | On_time | 694 | 4840 | 383 | 320 | 201 |
Delayed | 117 | 415 | 65 | 129 | 61 |
write.csv(delays, file = "delays.csv")
url_de <- "https://raw.githubusercontent.com/SalouaDaouki/Data607/main/airlinesDelays.csv"
Air_delays <- read.csv(file = url_de )
Air_delays
## X X.1 Los.Angeles Phoenix San.Diego San.Frnacisco Seattle
## 1 ALASKA on time 497 221 212 503 1840
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
colnames(Air_delays)[1] <- "airlines"
colnames(Air_delays)[2] <- "Status"
Air_delays
## airlines Status Los.Angeles Phoenix San.Diego San.Frnacisco Seattle
## 1 ALASKA on time 497 221 212 503 1840
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Air_delays <- drop_na(Air_delays)
Air_delays
## airlines Status Los.Angeles Phoenix San.Diego San.Frnacisco Seattle
## 1 ALASKA on time 497 221 212 503 1840
## 2 delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
Air_delays[Air_delays==""]<-NA
Air_delays <- Air_delays %>% fill(airlines, .direction = 'down')
Air_delays
## airlines Status Los.Angeles Phoenix San.Diego San.Frnacisco Seattle
## 1 ALASKA on time 497 221 212 503 1840
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
pivot_longer(data = data.frame, cols = columns.to.pivot, names_to = “New Column Name”, values_to = “New Column Name”)
Air_delays_long <- Air_delays %>%
pivot_longer(
cols = !c("airlines","Status"),
names_to = "Destination",
values_to = "Status_freq",
values_drop_na = TRUE,
)
Air_delays_long
## # A tibble: 20 × 4
## airlines Status Destination Status_freq
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Frnacisco 503
## 5 ALASKA on time Seattle 1840
## 6 ALASKA delayed Los.Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San.Diego 20
## 9 ALASKA delayed San.Frnacisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los.Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San.Diego 383
## 14 AM WEST on time San.Frnacisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los.Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San.Diego 65
## 19 AM WEST delayed San.Frnacisco 129
## 20 AM WEST delayed Seattle 61
Air_delays_long$Destination <- str_replace_all(Air_delays_long$Destination, "\\.", " ")
Air_delays_long
## # A tibble: 20 × 4
## airlines Status Destination Status_freq
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Frnacisco 503
## 5 ALASKA on time Seattle 1840
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Frnacisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San Diego 383
## 14 AM WEST on time San Frnacisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San Diego 65
## 19 AM WEST delayed San Frnacisco 129
## 20 AM WEST delayed Seattle 61
ALASKA_status <- Air_delays_long |>
filter(airlines == 'ALASKA')
ALASKA_status
## # A tibble: 10 × 4
## airlines Status Destination Status_freq
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Frnacisco 503
## 5 ALASKA on time Seattle 1840
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Frnacisco 102
## 10 ALASKA delayed Seattle 305
ggplot(ALASKA_status, aes(x = Status_freq, color = Status)) +
geom_density(linewidth = 0.75)
AMWEST_status <- Air_delays_long |>
filter(airlines == 'AM WEST')
AMWEST_status
## # A tibble: 10 × 4
## airlines Status Destination Status_freq
## <chr> <chr> <chr> <int>
## 1 AM WEST on time Los Angeles 694
## 2 AM WEST on time Phoenix 4840
## 3 AM WEST on time San Diego 383
## 4 AM WEST on time San Frnacisco 320
## 5 AM WEST on time Seattle 201
## 6 AM WEST delayed Los Angeles 117
## 7 AM WEST delayed Phoenix 415
## 8 AM WEST delayed San Diego 65
## 9 AM WEST delayed San Frnacisco 129
## 10 AM WEST delayed Seattle 61
ggplot(AMWEST_status, aes(x = Status_freq, color = Status)) +
geom_density(linewidth = 0.75)
Airlines_df <-
Air_delays_long %>%
group_by(airlines) %>%
summarise(
Total_flights = sum(Status_freq),
Total_delays = sum(ifelse(Status=='delayed', Status_freq, 0))
)
Airlines_df <-
Airlines_df %>%
mutate(
Total_on_time = Total_flights - Total_delays,
delay_percent = round((Total_delays / Total_flights) *100,2),
on_time_percent = round(((Total_flights - Total_delays) / Total_flights)*100,2)
) %>%
arrange(delay_percent)
Airlines_df
## # A tibble: 2 × 6
## airlines Total_flights Total_delays Total_on_time delay_percent on_time_perc…¹
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 AM WEST 7225 787 6438 10.9 89.1
## 2 ALASKA 3774 501 3273 13.3 86.7
## # … with abbreviated variable name ¹on_time_percent
Both airlines have less than 15% of delayed flights; Alaska has 3% difference of delays compared to AM West even thoug Am West airlines have more flights and more number of delayed flights. Meaning that AM West has more on time flights than Alaska.
Now let’s see destination wise:
Destination_df <-
Air_delays_long %>%
group_by(Destination) %>%
summarise(
Total_flights = sum(Status_freq),
Total_delays = sum(ifelse(Status=='delayed', Status_freq, 0))
)
Destination_df <-
Destination_df %>%
mutate(
Total_on_time = Total_flights - Total_delays,
delay_percent = round((Total_delays / Total_flights) *100,2),
on_time_percent = round(((Total_flights - Total_delays) / Total_flights)*100,2)
) %>%
arrange(delay_percent)
Destination_df
## # A tibble: 5 × 6
## Destination Total_flights Total_delays Total_on_time delay_percent on_time…¹
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Phoenix 5488 427 5061 7.78 92.2
## 2 San Diego 680 85 595 12.5 87.5
## 3 Los Angeles 1370 179 1191 13.1 86.9
## 4 Seattle 2407 366 2041 15.2 84.8
## 5 San Frnacisco 1054 231 823 21.9 78.1
## # … with abbreviated variable name ¹on_time_percent
Based on the table above, Phoenix tends to have more percentage (92%) of on_time flights, followed by San Diego. On the other hand, San Francisco seems to have more percentage of delayed flights.
write.csv(Air_delays_long, file = "Air_delays_long.csv")
write.csv(Airlines_df, file = "Airlines_df.csv")
write.csv(Destination_df, file = "Destination_df.csv")