Intro: Given a data set containing untidy flight-arrival-delay data, the task was to perform data frame operations to create a clean or ‘tidy’ data frame that can be further explored for insights.

Read CSV

# Import packages
library(dplyr)
library(ggplot2)
library(stringr)
library(tidyr)
# Read CSV and load into DF1
df1 <- read.csv("/Users/williamberritt/Downloads/assignment_4.csv", na.strings=c("","NA")) 
df1 
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503   1,841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA    <NA>        NA            NA    <NA>
## 4 AM WEST on time         694   4,840       383           320     201
## 5    <NA> delayed         117     415        65           129      61

Check classes to ensure smooth data frame operations. Adjust any classes that need to be changed.

#Check Classes
lapply(df1, class)
## $X
## [1] "character"
## 
## $X.1
## [1] "character"
## 
## $Los.Angeles
## [1] "integer"
## 
## $Phoenix
## [1] "character"
## 
## $San.Diego
## [1] "integer"
## 
## $San.Francisco
## [1] "integer"
## 
## $Seattle
## [1] "character"
# Update different classes
df1$Los.Angeles <- as.character(df1$Los.Angeles)
df1$San.Diego <- as.character(df1$San.Diego)
df1$San.Francisco <- as.character(df1$San.Francisco)

Hard code in data for the purpose of this exercise. (Given a larger data set and more time, I would’ve developed a programmatic logic to iteratively add in each city)

# Input data
df1[2,'X'] <- 'ALASKA'
df1[5,'X'] <- 'AM WEST'

Pivot table

# Pivot the table 
df1 <- df1 |> pivot_longer(cols = !(X:X.1),
                   names_to = 'city',
                   values_to ='value',
                   values_drop_na = TRUE)
# Check classes
lapply(df1, class)
## $X
## [1] "character"
## 
## $X.1
## [1] "character"
## 
## $city
## [1] "character"
## 
## $value
## [1] "character"

Perform text operations to make data more readable and prepared for analysis

# Remove . from cities and , from value
df1$city <- str_replace(df1$city, "\\.", " ")
df1$value <- str_replace(df1$value, ",", "")
df1$value <- as.numeric(df1$value)
# Preview changes
df1
## # A tibble: 20 × 4
##    X       X.1     city          value
##    <chr>   <chr>   <chr>         <dbl>
##  1 ALASKA  on time Los Angeles     497
##  2 ALASKA  on time Phoenix         221
##  3 ALASKA  on time San Diego       212
##  4 ALASKA  on time San Francisco   503
##  5 ALASKA  on time Seattle        1841
##  6 ALASKA  delayed Los Angeles      62
##  7 ALASKA  delayed Phoenix          12
##  8 ALASKA  delayed San Diego        20
##  9 ALASKA  delayed San Francisco   102
## 10 ALASKA  delayed Seattle         305
## 11 AM WEST on time Los Angeles     694
## 12 AM WEST on time Phoenix        4840
## 13 AM WEST on time San Diego       383
## 14 AM WEST on time San Francisco   320
## 15 AM WEST on time Seattle         201
## 16 AM WEST delayed Los Angeles     117
## 17 AM WEST delayed Phoenix         415
## 18 AM WEST delayed San Diego        65
## 19 AM WEST delayed San Francisco   129
## 20 AM WEST delayed Seattle          61

Create a subset of only delayed flights

# Subset where X.1 = delayed
df_delayed = subset(df1, X.1 == 'delayed')

Create a data frame that has a new column with total flights for each airline and each city

# Aggregate the indivdual flight values by city and airline
df_total_flights <- df1 |> group_by(X, city) |> summarize(total_flights = sum(value))
## `summarise()` has grouped output by 'X'. You can override using the `.groups`
## argument.

Join data frames to include both total flights and number of delayed flights

# Left join where dftf.X = dfd.X and dftf.city=dfd.city
df_summarize <- left_join(df_total_flights, df_delayed, by=c('X', 'city'))
# Preview
df_summarize
## # A tibble: 10 × 5
## # Groups:   X [2]
##    X       city          total_flights X.1     value
##    <chr>   <chr>                 <dbl> <chr>   <dbl>
##  1 ALASKA  Los Angeles             559 delayed    62
##  2 ALASKA  Phoenix                 233 delayed    12
##  3 ALASKA  San Diego               232 delayed    20
##  4 ALASKA  San Francisco           605 delayed   102
##  5 ALASKA  Seattle                2146 delayed   305
##  6 AM WEST Los Angeles             811 delayed   117
##  7 AM WEST Phoenix                5255 delayed   415
##  8 AM WEST San Diego               448 delayed    65
##  9 AM WEST San Francisco           449 delayed   129
## 10 AM WEST Seattle                 262 delayed    61

Rename columns

# Rename columns
colnames(df_summarize) <- c("airline", "city", 'total_flights', 'status', 'value')

Create delayed flights per 100 column dividing delayed flights by total flights

# Check classes of new table
lapply(df_summarize, class)
## $airline
## [1] "character"
## 
## $city
## [1] "character"
## 
## $total_flights
## [1] "numeric"
## 
## $status
## [1] "character"
## 
## $value
## [1] "numeric"
# Create new delayed flights per 100 column and order by city desc
df_summarize <- df_summarize |> mutate(delayed_flights_per_100 = value/total_flights*100)
df_summarize <- df_summarize[order(df_summarize$city), ]

# Preview
df_summarize
## # A tibble: 10 × 6
## # Groups:   airline [2]
##    airline city          total_flights status  value delayed_flights_per_100
##    <chr>   <chr>                 <dbl> <chr>   <dbl>                   <dbl>
##  1 ALASKA  Los Angeles             559 delayed    62                   11.1 
##  2 AM WEST Los Angeles             811 delayed   117                   14.4 
##  3 ALASKA  Phoenix                 233 delayed    12                    5.15
##  4 AM WEST Phoenix                5255 delayed   415                    7.90
##  5 ALASKA  San Diego               232 delayed    20                    8.62
##  6 AM WEST San Diego               448 delayed    65                   14.5 
##  7 ALASKA  San Francisco           605 delayed   102                   16.9 
##  8 AM WEST San Francisco           449 delayed   129                   28.7 
##  9 ALASKA  Seattle                2146 delayed   305                   14.2 
## 10 AM WEST Seattle                 262 delayed    61                   23.3

Create grouped bar chart so show comparison between delays for each airline + city

# Plot grouped bar chart to see differences between airlines in cities
ggplot(df_summarize, aes(fill=airline, y=delayed_flights_per_100, x=city)) + 
  geom_bar(position='dodge', stat='identity')

Conclusion: Don’t fly AM West to any of these cities unless you have to! Not actually - there are many different factors when assessing the value of a flight like flight/airline quality, price, etc. Also important to consider there may be biases in this representation. For example, if AM West only has flights that are delayed around the holidays and otherwise runs more efficiently than Alaska, that would be a critical piece of information that should be researched.