Assignment – Tidying and Transforming Data

Step 1: Create connection with MySQL

#Above code will be hidden for password privacy purpose
#alternative way to read from csv file 
#url = 'https://raw.githubusercontent.com/mehreengillani/DATA607/refs/heads/main/flight_info.csv'
#flights <- read.csv(url)
#head(flights)

Step 2: load data from MySQL

flights <- dbReadTable(con, "flight_delays")
flights
##    id airline       airport ontime delay
## 1   1  ALASKA   Los.Angeles    497    62
## 2   2  ALASKA       Phoenix    221    12
## 3   3  ALASKA     San.Diego    212    20
## 4   4  ALASKA San.Francisco    503   102
## 5   5  ALASKA       Seattle   1841   305
## 6   6 AM_WEST   Los.Angeles    694   117
## 7   7 AM_WEST       Phoenix   4840   415
## 8   8 AM_WEST     San.Diego    383    65
## 9   9 AM_WEST San.Francisco    320   129
## 10 10 AM_WEST       Seattle    201    61

Step 3: Check null values, show summary of numeric columns, find unique values for categorical variables

# Check for missing values in the dataset
colSums(is.na(flights))  # number of NAs per column
##      id airline airport  ontime   delay 
##       0       0       0       0       0
# Summary for numeric columns 
summary(flights[sapply(flights, is.numeric)])
##        id            ontime           delay       
##  Min.   : 1.00   Min.   : 201.0   Min.   : 12.00  
##  1st Qu.: 3.25   1st Qu.: 245.8   1st Qu.: 61.25  
##  Median : 5.50   Median : 440.0   Median : 83.50  
##  Mean   : 5.50   Mean   : 971.2   Mean   :128.80  
##  3rd Qu.: 7.75   3rd Qu.: 646.2   3rd Qu.:126.00  
##  Max.   :10.00   Max.   :4840.0   Max.   :415.00
#print unique values for airlines
unique(flights$airline)
## [1] "ALASKA"  "AM_WEST"
#printunique values for airports
unique(flights$airport)
## [1] "Los.Angeles"   "Phoenix"       "San.Diego"     "San.Francisco"
## [5] "Seattle"

There are no null values. There are two airlines and five destination airports. Max number of delayed flights are 415 with average of 128. Max number of on time flights are 4840 with mean value of 971.

Step 4: Tidy the data (wide → long)

flights_long <- flights %>%
  pivot_longer(cols = c(ontime, delay),
               names_to = "status",
               values_to = "count")
flights_long
## # A tibble: 20 × 5
##       id airline airport       status count
##    <int> <chr>   <chr>         <chr>  <int>
##  1     1 ALASKA  Los.Angeles   ontime   497
##  2     1 ALASKA  Los.Angeles   delay     62
##  3     2 ALASKA  Phoenix       ontime   221
##  4     2 ALASKA  Phoenix       delay     12
##  5     3 ALASKA  San.Diego     ontime   212
##  6     3 ALASKA  San.Diego     delay     20
##  7     4 ALASKA  San.Francisco ontime   503
##  8     4 ALASKA  San.Francisco delay    102
##  9     5 ALASKA  Seattle       ontime  1841
## 10     5 ALASKA  Seattle       delay    305
## 11     6 AM_WEST Los.Angeles   ontime   694
## 12     6 AM_WEST Los.Angeles   delay    117
## 13     7 AM_WEST Phoenix       ontime  4840
## 14     7 AM_WEST Phoenix       delay    415
## 15     8 AM_WEST San.Diego     ontime   383
## 16     8 AM_WEST San.Diego     delay     65
## 17     9 AM_WEST San.Francisco ontime   320
## 18     9 AM_WEST San.Francisco delay    129
## 19    10 AM_WEST Seattle       ontime   201
## 20    10 AM_WEST Seattle       delay     61

Step 5: Calculate delay rate per airline

airline_delay_summary <- flights %>%
  group_by(airline) %>%
  summarise(
    total_flights = sum(ontime + delay),
    total_delays  = sum(delay),
    delay_rate    =  total_delays / total_flights, 
    on_time_rate    = sum(ontime) / total_flights
  ) %>%
  arrange(desc(delay_rate))

#arrival delay rates per airline
airline_delay_summary
## # A tibble: 2 × 5
##   airline total_flights total_delays delay_rate on_time_rate
##   <chr>           <int>        <int>      <dbl>        <dbl>
## 1 ALASKA           3775          501      0.133        0.867
## 2 AM_WEST          7225          787      0.109        0.891

Step 6: Visualization

6.1 Arrival Delay vs On-time flights percentage by Airline

library(ggplot2)

# Convert to long format for plotting
airline_long <- airline_delay_summary %>%
  pivot_longer(cols = c(delay_rate, on_time_rate),
               names_to = "status",
               values_to = "rate")

# Plot side-by-side bars (with % in title and axis label)
ggplot(airline_long, aes(x = airline, y = rate, fill = status)) +
  geom_col(position = position_dodge(width = 0.6), width = 0.6) +
  geom_text(aes(label = paste0(round(rate*100, 1), "%")),
            position = position_dodge(width = 0.6),
            vjust = -0.5, size = 3) +
  labs(title = "Arrival Delay vs On-time Percentage by Airline",
       x = "Airline",
       y = "Percentage") +   # changed here
  scale_y_continuous(labels = scales::percent_format(), limits = c(0, 1.1)) +
  scale_fill_manual(values = c("delay_rate" = "darkblue", "on_time_rate" = "skyblue"),
                    labels = c("Delay", "On-time")) +
  theme_minimal()

# Calculate total flights, delays, and rates per destination airport
airport_summary <- flights %>%
  group_by(airport) %>%
  summarise(
    total_flights = sum(ontime + delay),
    total_delays  = sum(delay),
    total_ontime  = sum(ontime),
    delay_rate    = total_delays / total_flights,
    on_time_rate  = total_ontime / total_flights
  ) %>%
  arrange(desc(delay_rate))

airport_summary
## # A tibble: 5 × 6
##   airport       total_flights total_delays total_ontime delay_rate on_time_rate
##   <chr>                 <int>        <int>        <int>      <dbl>        <dbl>
## 1 San.Francisco          1054          231          823     0.219         0.781
## 2 Seattle                2408          366         2042     0.152         0.848
## 3 Los.Angeles            1370          179         1191     0.131         0.869
## 4 San.Diego               680           85          595     0.125         0.875
## 5 Phoenix                5488          427         5061     0.0778        0.922

6.2 Arrival Delay vs On-time flights percentage by Destination Airport

# Convert to long format for plotting
airport_long <- airport_summary %>%
  pivot_longer(cols = c(delay_rate, on_time_rate),
               names_to = "status",
               values_to = "rate")
ggplot(airport_long, aes(x = airport, y = rate, fill = status)) +
  geom_col(position = position_dodge(width = 0.6), width = 0.6) +
  geom_text(aes(label = paste0(round(rate*100, 1), "%")),
            position = position_dodge(width = 0.6),
            vjust = -0.5, size = 3) +
  labs(title = "Arrival Delay vs On-time flights percentage by Destination Airport",
       x = "Destination Airport",
       y = "Percentage") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 1.1)) +
  scale_fill_manual(values = c("delay_rate" = "black", "on_time_rate" = "grey"),
                    labels = c("Delay", "On-time")) +
  theme_minimal()


6.3 Delay vs On-time flights percentage by Airline and Destination Airport

airline_airport_summary <- flights %>%
  group_by(airline, airport) %>%
  summarise(
    total_flights = sum(ontime + delay),
    total_delays  = sum(delay),
    total_ontime  = sum(ontime),
    delay_rate    = total_delays / total_flights,
    on_time_rate  = total_ontime / total_flights
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'airline'. You can override using the
## `.groups` argument.
# Reshape for plotting
airline_airport_long <- airline_airport_summary %>%
  pivot_longer(cols = c(delay_rate, on_time_rate),
               names_to = "status",
               values_to = "rate")

# Plot grouped bars
ggplot(airline_airport_long, aes(x = airport, y = rate, fill = status)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(round(rate*100, 1), "%")),
            position = position_dodge(width = 0.8),
            hjust = -0.1, size = 3) +  # hjust places text in line horizontally
  facet_wrap(~airline, ncol = 1) +
  labs(title = "Delay vs On-time flights percentage by Airport and Airline",
       x = "Destination Airport",
       y = "percentage") +
  scale_y_continuous(labels = percent_format(), limits = c(0, 1.1)) +
  scale_fill_manual(values = c("delay_rate" = "black", "on_time_rate" = "grey"),
                    labels = c("Delay", "On-time")) +
  coord_flip() +  # flip axes to make horizontal bars
  theme_minimal()


Conclusion:

  • The total number of flights for AM_WEST is more than double that of ALASKA.

  • ALASKA has a higher overall delay percentage than AM_WEST by approximately 2.3%.

  • Phoenix handles the highest number of flights, followed by Seattle in second place.

  • San Francisco has the highest delay rate among destinations, with Seattle following.

  • Phoenix has the lowest flight delay rate across all destinations.

  • The ALASKA flights to Phoenix have the lowest individual delay rate, followed closely by AM_WEST flights to Phoenix.

  • AM_WEST flights to San Francisco experience the highest delay rate, followed by ALASKA flights to San Francisco.