Install and load necessary packages

options(repos = c(CRAN = "https://cran.rstudio.com"))
req_packages <- c("DBI","RMySQL","dplyr","dbplyr","knitr","tidyr", "readr", "stringr","tibble", "rmarkdown", "purrr", "lubridate", "here", "httr2", "RCurl","ggplot2")
for (pkg in req_packages) {
  if (!require(pkg, character.only = TRUE)) {
    message(paste("Installing package:", pkg))
    install.packages(pkg, dependencies = TRUE)
  } else {
    message(paste(pkg, " already installed."))
  }
  library(pkg, character.only = TRUE)
}
## Loading required package: DBI
## DBI  already installed.
## Loading required package: RMySQL
## RMySQL  already installed.
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## dplyr  already installed.
## Loading required package: dbplyr
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
## dbplyr  already installed.
## Loading required package: knitr
## knitr  already installed.
## Loading required package: tidyr
## tidyr  already installed.
## Loading required package: readr
## readr  already installed.
## Loading required package: stringr
## stringr  already installed.
## Loading required package: tibble
## tibble  already installed.
## Loading required package: rmarkdown
## rmarkdown  already installed.
## Loading required package: purrr
## purrr  already installed.
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## lubridate  already installed.
## Loading required package: here
## here() starts at /Users/paulabrown/Documents/CUNY SPS- Data 607/Week 5 Assignments
## here  already installed.
## Loading required package: httr2
## httr2  already installed.
## Loading required package: RCurl
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
## RCurl  already installed.
## Loading required package: ggplot2
## ggplot2  already installed.

Create datafram for provided Airline Delay image

# Create a matrix that mimics the untidy structure
untidy_data <- data.frame(
  X = c("ALASKA", "", "", "AM WEST",""),
  Y = c("on time","delayed","","on time","delayed"),
  `Los Angeles` = c(497, 62, "", 694, 117),
  Phoenix = c(221, 12, "", 4840, 415),
  `San Diego` = c(212, 20, "", 383, 65),
  `San Francisco` = c(503, 102, "", 320, 129),
  Seattle = c(1841, 305, "", 201, 61),
  stringsAsFactors = FALSE
)

# Rename the first column to blank (like in the image)
names(untidy_data)[1] <- ""

Create CSV file for provided image

We need a CSV of untidy data so that we can practice tidying and transforming the data with R.

# Save the untidy data to a CSV file
write.csv(untidy_data, here::here("untidy_flight_data.csv"), row.names = FALSE)

Read the data from the CSV file

After creating the CSV containing untidy data, we must read it in R so we can begin tidying and transforming.

# Read the CSV
untidy_delays <- read.csv(here("untidy_flight_data.csv"), na.strings = c("", "NA"))
## na.strings = c("", "NA")) tells R to treat blanks and NA as missing values

##Summarize the data in the .csv file Let’s take a look at the untidy data we created. Check to see that it reflects the data in the “Airline Delay” image from Source: Numbersense, Kaiser Fung, McGraw Hill, 2013

knitr::kable(untidy_delays)
X Y Los.Angeles Phoenix San.Diego San.Francisco Seattle
ALASKA on time 497 221 212 503 1841
NA delayed 62 12 20 102 305
NA NA NA NA NA NA NA
AM WEST on time 694 4840 383 320 201
NA delayed 117 415 65 129 61
# kable() = creates a nicely formatted table from a data frame or tibble.

Remove NA values

We only want to see the rows with data - remove/filter out blank/NA rows.

## Remove rows with all NA values
untidy_delays <- untidy_delays %>%
  filter(rowSums(is.na(.)) != ncol(.)) ## Filter to show rows that are NOT "NA"

Fill in missing airline names

Associate the airline with the row containing data for the “delayed” status.

## Fill missing airline names
tidy_delays <- untidy_delays %>%
  tidyr::fill(X, .direction = "down") ##fill() tells R to fill in the column "X" and fill it going in a downward direction.

Quick Preview to determine next steps

tidy_delays
##         X       Y Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

Pivot table

Transform table from wide to long by combining City names into 1 column named “City” and their values in another column named “Delay”

## Pivot city columns into long format
tidy_delays <- tidy_delays %>%
  pivot_longer(cols = -c(X, Y), names_to = "City", values_to = "Delay")

Rename X and Y columns accordingly

Rename for clarity.

tidy_delays <- tidy_delays %>%
  rename(Airline = X, Status = Y)

View cleaned data

knitr::kable(tidy_delays)
Airline Status City Delay
ALASKA on time Los.Angeles 497
ALASKA on time Phoenix 221
ALASKA on time San.Diego 212
ALASKA on time San.Francisco 503
ALASKA on time Seattle 1841
ALASKA delayed Los.Angeles 62
ALASKA delayed Phoenix 12
ALASKA delayed San.Diego 20
ALASKA delayed San.Francisco 102
ALASKA delayed Seattle 305
AM WEST on time Los.Angeles 694
AM WEST on time Phoenix 4840
AM WEST on time San.Diego 383
AM WEST on time San.Francisco 320
AM WEST on time Seattle 201
AM WEST delayed Los.Angeles 117
AM WEST delayed Phoenix 415
AM WEST delayed San.Diego 65
AM WEST delayed San.Francisco 129
AM WEST delayed Seattle 61

Summarize the data

Based on the overall averages below, we can conclude that both airlines are mostly on time.

# Summary statistics
summary_stats <- tidy_delays %>%
  group_by(Airline,Status) %>%
  summarise(Average_Delay = mean(Delay), .groups = "drop")

print(summary_stats)
## # A tibble: 4 × 3
##   Airline Status  Average_Delay
##   <chr>   <chr>           <dbl>
## 1 ALASKA  delayed          100.
## 2 ALASKA  on time          655.
## 3 AM WEST delayed          157.
## 4 AM WEST on time         1288.

Calculate the percentage of the overall summary

# Summarize total flights per airline and status
overall_summary <- tidy_delays %>%
  group_by(Airline, Status) %>%
  summarise(Total = sum(Delay), .groups = "drop")

# Calculate percentage of each status per airline
overall_summary <- overall_summary %>%
  group_by(Airline) %>%
  mutate(Percentage = round((Total / sum(Total)) * 100, 2))
print(overall_summary)
## # A tibble: 4 × 4
## # Groups:   Airline [2]
##   Airline Status  Total Percentage
##   <chr>   <chr>   <int>      <dbl>
## 1 ALASKA  delayed   501       13.3
## 2 ALASKA  on time  3274       86.7
## 3 AM WEST delayed   787       10.9
## 4 AM WEST on time  6438       89.1

Summarize the data

On average both airlines perform well as they are often on time than and rarely delayed. It appears that there are more flights to Phoenix via AM West airline, and more flights to Seattle via Alaska airline. Overall, it seems AM West is a more popular airline and may have more flights which can skew the averages in its favor. Since we do not know the number of flights for each airline’s city we cannot really compare or give a recommendation. Another thing to factor in is how close in proximity each airport is to the destinations.

# Summary statistics
summary_stats <- tidy_delays %>%
  group_by(Airline,Status,City) %>%
  summarise(Average_Delay = mean(Delay), .groups = "drop")

print(summary_stats)
## # A tibble: 20 × 4
##    Airline Status  City          Average_Delay
##    <chr>   <chr>   <chr>                 <dbl>
##  1 ALASKA  delayed Los.Angeles              62
##  2 ALASKA  delayed Phoenix                  12
##  3 ALASKA  delayed San.Diego                20
##  4 ALASKA  delayed San.Francisco           102
##  5 ALASKA  delayed Seattle                 305
##  6 ALASKA  on time Los.Angeles             497
##  7 ALASKA  on time Phoenix                 221
##  8 ALASKA  on time San.Diego               212
##  9 ALASKA  on time San.Francisco           503
## 10 ALASKA  on time Seattle                1841
## 11 AM WEST delayed Los.Angeles             117
## 12 AM WEST delayed Phoenix                 415
## 13 AM WEST delayed San.Diego                65
## 14 AM WEST delayed San.Francisco           129
## 15 AM WEST delayed Seattle                  61
## 16 AM WEST on time Los.Angeles             694
## 17 AM WEST on time Phoenix                4840
## 18 AM WEST on time San.Diego               383
## 19 AM WEST on time San.Francisco           320
## 20 AM WEST on time Seattle                 201

Plot Overall Summary

# Plot overall percentages
ggplot(overall_summary, aes(x = Airline, y = Percentage, fill = Status)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Overall Arrival vs Delay Percentages by Airline",
       y = "Percentage (%)", x = "Airline") +
  scale_fill_manual(values = c("on time" = "lightblue", "delayed" = "pink")) +
  theme_minimal()

## Summarize flights per airline, status, and city
city_summary <- tidy_delays %>%
  group_by(Airline, City, Status) %>%
  summarise(Total = sum(Delay), .groups = "drop")

## Calculate percentage of each status per airline per city
city_summary <- city_summary %>%
  group_by(Airline, City) %>%
  mutate(Percentage = round((Total / sum(Total)) * 100, 2))
print(city_summary)
## # A tibble: 20 × 5
## # Groups:   Airline, City [10]
##    Airline City          Status  Total Percentage
##    <chr>   <chr>         <chr>   <int>      <dbl>
##  1 ALASKA  Los.Angeles   delayed    62      11.1 
##  2 ALASKA  Los.Angeles   on time   497      88.9 
##  3 ALASKA  Phoenix       delayed    12       5.15
##  4 ALASKA  Phoenix       on time   221      94.8 
##  5 ALASKA  San.Diego     delayed    20       8.62
##  6 ALASKA  San.Diego     on time   212      91.4 
##  7 ALASKA  San.Francisco delayed   102      16.9 
##  8 ALASKA  San.Francisco on time   503      83.1 
##  9 ALASKA  Seattle       delayed   305      14.2 
## 10 ALASKA  Seattle       on time  1841      85.8 
## 11 AM WEST Los.Angeles   delayed   117      14.4 
## 12 AM WEST Los.Angeles   on time   694      85.6 
## 13 AM WEST Phoenix       delayed   415       7.9 
## 14 AM WEST Phoenix       on time  4840      92.1 
## 15 AM WEST San.Diego     delayed    65      14.5 
## 16 AM WEST San.Diego     on time   383      85.5 
## 17 AM WEST San.Francisco delayed   129      28.7 
## 18 AM WEST San.Francisco on time   320      71.3 
## 19 AM WEST Seattle       delayed    61      23.3 
## 20 AM WEST Seattle       on time   201      76.7

Plot City Summary

# Plot overall percentages
ggplot(city_summary, aes(x = City, y = Percentage, fill = Status)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "City Arrival vs City Percentages by Airline",
       y = "Percentage (%)", x = "Airline") +
  scale_fill_manual(values = c("on time" = "gray", "delayed" ="orange")) +
  theme_minimal()