1)Loading data

In this assignment, the data was loaded from a CSV file. A column name was added for the first and second column in the data frame obtained from the CSV. Additionally, any missing values in the carrier column were populated with the previous record.

df_flights <- read_csv("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/assigment5.csv",show_col_types = FALSE)
## New names:
## • `` -> `...1`
## • `` -> `...2`
#I will rename the first columns
colnames(df_flights)[c(1,2)] <- c("carrier_name","flight_status")

#Before transformation the data source I will analyze the structure
#str(df_flights)

#Fill Down missing values for carrier_name
df_flights <- df_flights |>
  fill(carrier_name, .direction = "down") 

df_flights
## # A tibble: 5 × 7
##   carrier_name flight_status `Los Angeles` Phoenix `San Diego` `San Francisco`
##   <chr>        <chr>                 <dbl>   <dbl>       <dbl>           <dbl>
## 1 Alaska       on time                 497     221         212             503
## 2 Alaska       delayed                  62      12          20             102
## 3 Alaska       <NA>                     NA      NA          NA              NA
## 4 AMWEST       on time                 694    4840         383             320
## 5 AMWEST       delayed                 117     415          65             129
## # ℹ 1 more variable: Seattle <dbl>

2) Now we will identify the variables and observations

df_flights <- df_flights %>%
  pivot_longer(
    #Column names that start with a capital letter will become observations for a new column state
    cols = colnames(df_flights)[grep("^[A-Z]", colnames(df_flights))], 
    names_to = "destination",
    values_to = "number_of_flights",
    values_drop_na = TRUE
  )

#to use this function you need to install DT 

datatable(df_flights)

3) Perform analysis to compare the arrival delays for the two airlines.

A statistical summary was created to analyze the data. The carrier with the highest average number of delayed flights is AMMEST and with the most average on-time flights is AMMWEST. Additionally, AMWEST has a greater total number of flights compared to Alaska.

summary_table <- df_flights |>
  group_by(carrier_name, flight_status) |>
  summarise(
    Mean = round(mean(number_of_flights), 1),
    Median = median(number_of_flights),
    SD = round(sd(number_of_flights), 1),
    Min = min(number_of_flights),
    Max = max(number_of_flights),
    Count = sum(number_of_flights)
  )
## `summarise()` has grouped output by 'carrier_name'. You can override using the
## `.groups` argument.
# Display the summary table using kable
summary_table  |>
  kable(caption = "Statistical Summary of Flights by Airline and Flight Status")  |>
  kable_styling(full_width = F, position = "center")
Statistical Summary of Flights by Airline and Flight Status
carrier_name flight_status Mean Median SD Min Max Count
AMWEST delayed 157.4 117 147.2 61 415 787
AMWEST on time 1287.6 383 1994.2 201 4840 6438
Alaska delayed 100.2 62 120.0 12 305 501
Alaska on time 654.8 497 678.1 212 1841 3274

When comparing based on destination and status, we can determine that Alaska has a higher percentage of on-time flights in Los Angeles, Phoenix, San Diego, San Francisco, and Seattle. In contrast, AMWEST has the highest percentage of delayed flights.

summaryt1 <- df_flights |>
  group_by(carrier_name, destination, flight_status) |>
  summarize(total_flights = sum(number_of_flights)) |>
  mutate(per_occurence = round(total_flights / sum(total_flights) * 100, 2))
## `summarise()` has grouped output by 'carrier_name', 'destination'. You can
## override using the `.groups` argument.
summaryt1  |>
  kable(caption = "Flights by Carrier and Destination")  |>
  kable_styling(full_width = F, position = "center")
Flights by Carrier and Destination
carrier_name destination flight_status total_flights per_occurence
AMWEST Los Angeles delayed 117 14.43
AMWEST Los Angeles on time 694 85.57
AMWEST Phoenix delayed 415 7.90
AMWEST Phoenix on time 4840 92.10
AMWEST San Diego delayed 65 14.51
AMWEST San Diego on time 383 85.49
AMWEST San Francisco delayed 129 28.73
AMWEST San Francisco on time 320 71.27
AMWEST Seattle delayed 61 23.28
AMWEST Seattle on time 201 76.72
Alaska Los Angeles delayed 62 11.09
Alaska Los Angeles on time 497 88.91
Alaska Phoenix delayed 12 5.15
Alaska Phoenix on time 221 94.85
Alaska San Diego delayed 20 8.62
Alaska San Diego on time 212 91.38
Alaska San Francisco delayed 102 16.86
Alaska San Francisco on time 503 83.14
Alaska Seattle delayed 305 14.21
Alaska Seattle on time 1841 85.79
perc_flight_des_plot <-ggplot(summaryt1, aes(x = destination, y = per_occurence, fill = carrier_name)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = paste(per_occurence,'%')), 
            position = position_dodge(width = 0.9), 
            vjust = 0.5, angle = 90, hjust =1, color="#7F7F7F") +  
  facet_wrap(~ flight_status) +
  labs(y = "Percentange of Flights", x = "Destination",title = "Flights by Airline and Destination",
       subtitle = "Comparison between status")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_fill_manual('Airline', values=c('#003863','#007461'))


# Convert ggplot to an interactive plotly plot
interactive_flights_geom_bar <- ggplotly(perc_flight_des_plot)

# Display the interactive scatter plot
interactive_flights_geom_bar

Overall, when we calculate the proportion of flights that arrived on time, we can see that AMWEST had the highest percentage at 89.11% .

summaryt2 <- df_flights |>
  group_by(carrier_name,flight_status) |>
  summarize(total_flights = sum(number_of_flights)) |>
  mutate(per_occurence = round(total_flights / sum(total_flights) * 100, 2))
## `summarise()` has grouped output by 'carrier_name'. You can override using the
## `.groups` argument.
summaryt2  |>
  kable(caption = "Flights by Carrier and Status")  |>
  kable_styling(full_width = F, position = "center")
Flights by Carrier and Status
carrier_name flight_status total_flights per_occurence
AMWEST delayed 787 10.89
AMWEST on time 6438 89.11
Alaska delayed 501 13.27
Alaska on time 3274 86.73
flights_geom_bar <- 
ggplot(summaryt2, aes(x = flight_status, y = per_occurence, fill = carrier_name)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = paste(per_occurence,'%')), 
            position = position_dodge(width = 1), 
            vjust = 0.5, angle = 90, hjust =1, color="#7F7F7F") +
  labs(y = "Percentange of Flights", x = "Airline", fill = "Carrier",title = "Flights by Status",
       subtitle = "Comparison between status")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_fill_manual('Airline', values=c('#003863','#007461'))



# Convert ggplot to an interactive plotly plot
interactive_flights_scatter_plot <- ggplotly(flights_geom_bar)

# Display the interactive scatter plot
interactive_flights_scatter_plot

  1. A-Z↩︎