Intro: Given a data set containing untidy flight-arrival-delay data,
the task was to perform data frame operations to create a clean or
‘tidy’ data frame that can be further explored for insights.
Read CSV
# Import packages
library(dplyr)
library(ggplot2)
library(stringr)
library(tidyr)
# Read CSV and load into DF1
df1 <- read.csv("/Users/williamberritt/Downloads/assignment_4.csv", na.strings=c("","NA"))
df1
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA <NA> NA NA <NA>
## 4 AM WEST on time 694 4,840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
Check classes to ensure smooth data frame operations. Adjust any
classes that need to be changed.
#Check Classes
lapply(df1, class)
## $X
## [1] "character"
##
## $X.1
## [1] "character"
##
## $Los.Angeles
## [1] "integer"
##
## $Phoenix
## [1] "character"
##
## $San.Diego
## [1] "integer"
##
## $San.Francisco
## [1] "integer"
##
## $Seattle
## [1] "character"
# Update different classes
df1$Los.Angeles <- as.character(df1$Los.Angeles)
df1$San.Diego <- as.character(df1$San.Diego)
df1$San.Francisco <- as.character(df1$San.Francisco)
Hard code in data for the purpose of this exercise. (Given a larger
data set and more time, I would’ve developed a programmatic logic to
iteratively add in each city)
# Input data
df1[2,'X'] <- 'ALASKA'
df1[5,'X'] <- 'AM WEST'
Pivot table
# Pivot the table
df1 <- df1 |> pivot_longer(cols = !(X:X.1),
names_to = 'city',
values_to ='value',
values_drop_na = TRUE)
# Check classes
lapply(df1, class)
## $X
## [1] "character"
##
## $X.1
## [1] "character"
##
## $city
## [1] "character"
##
## $value
## [1] "character"
Perform text operations to make data more readable and prepared for
analysis
# Remove . from cities and , from value
df1$city <- str_replace(df1$city, "\\.", " ")
df1$value <- str_replace(df1$value, ",", "")
df1$value <- as.numeric(df1$value)
# Preview changes
df1
## # A tibble: 20 × 4
## X X.1 city value
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San Diego 383
## 14 AM WEST on time San Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San Diego 65
## 19 AM WEST delayed San Francisco 129
## 20 AM WEST delayed Seattle 61
Create a subset of only delayed flights
# Subset where X.1 = delayed
df_delayed = subset(df1, X.1 == 'delayed')
Create a data frame that has a new column with total flights for
each airline and each city
# Aggregate the indivdual flight values by city and airline
df_total_flights <- df1 |> group_by(X, city) |> summarize(total_flights = sum(value))
## `summarise()` has grouped output by 'X'. You can override using the `.groups`
## argument.
Join data frames to include both total flights and number of delayed
flights
# Left join where dftf.X = dfd.X and dftf.city=dfd.city
df_summarize <- left_join(df_total_flights, df_delayed, by=c('X', 'city'))
# Preview
df_summarize
## # A tibble: 10 × 5
## # Groups: X [2]
## X city total_flights X.1 value
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ALASKA Los Angeles 559 delayed 62
## 2 ALASKA Phoenix 233 delayed 12
## 3 ALASKA San Diego 232 delayed 20
## 4 ALASKA San Francisco 605 delayed 102
## 5 ALASKA Seattle 2146 delayed 305
## 6 AM WEST Los Angeles 811 delayed 117
## 7 AM WEST Phoenix 5255 delayed 415
## 8 AM WEST San Diego 448 delayed 65
## 9 AM WEST San Francisco 449 delayed 129
## 10 AM WEST Seattle 262 delayed 61
Rename columns
# Rename columns
colnames(df_summarize) <- c("airline", "city", 'total_flights', 'status', 'value')
Create delayed flights per 100 column dividing delayed flights by
total flights
# Check classes of new table
lapply(df_summarize, class)
## $airline
## [1] "character"
##
## $city
## [1] "character"
##
## $total_flights
## [1] "numeric"
##
## $status
## [1] "character"
##
## $value
## [1] "numeric"
# Create new delayed flights per 100 column and order by city desc
df_summarize <- df_summarize |> mutate(delayed_flights_per_100 = value/total_flights*100)
df_summarize <- df_summarize[order(df_summarize$city), ]
# Preview
df_summarize
## # A tibble: 10 × 6
## # Groups: airline [2]
## airline city total_flights status value delayed_flights_per_100
## <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 ALASKA Los Angeles 559 delayed 62 11.1
## 2 AM WEST Los Angeles 811 delayed 117 14.4
## 3 ALASKA Phoenix 233 delayed 12 5.15
## 4 AM WEST Phoenix 5255 delayed 415 7.90
## 5 ALASKA San Diego 232 delayed 20 8.62
## 6 AM WEST San Diego 448 delayed 65 14.5
## 7 ALASKA San Francisco 605 delayed 102 16.9
## 8 AM WEST San Francisco 449 delayed 129 28.7
## 9 ALASKA Seattle 2146 delayed 305 14.2
## 10 AM WEST Seattle 262 delayed 61 23.3
Create grouped bar chart so show comparison between delays for each
airline + city
# Plot grouped bar chart to see differences between airlines in cities
ggplot(df_summarize, aes(fill=airline, y=delayed_flights_per_100, x=city)) +
geom_bar(position='dodge', stat='identity')

Conclusion: Don’t fly AM West to any of these cities unless you have
to! Not actually - there are many different factors when assessing the
value of a flight like flight/airline quality, price, etc. Also
important to consider there may be biases in this representation. For
example, if AM West only has flights that are delayed around the
holidays and otherwise runs more efficiently than Alaska, that would be
a critical piece of information that should be researched.